From a9ebb862ebe8e457825e2c2646b7021be687b6ec Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 21 Apr 2017 12:05:00 +0800
Subject: [PATCH 001/189] fix build: add libpng/zlib to the VS projects

---
 guetzli.vcxproj                |  55 +++++++++++-
 guetzli.vcxproj.filters        | 157 ++++++++++++++++++++++++++++++++-
 guetzli.vcxproj.user           |   8 ++
 guetzli/guetzli.cc             |   4 +-
 guetzli_static.vcxproj         |  55 +++++++++++-
 guetzli_static.vcxproj.filters | 157 ++++++++++++++++++++++++++++++++-
 6 files changed, 429 insertions(+), 7 deletions(-)
 create mode 100644 guetzli.vcxproj.user
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 5b7ffeb9..dd49fa15 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -116,7 +116,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>Full</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
@@ -150,7 +150,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
     </ClCompile>
@@ -188,6 +188,20 @@
     <ClInclude Include="guetzli\score.h" />
     <ClInclude Include="guetzli\stats.h" />
     <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h" />
+    <ClInclude Include="third_party\libpng\png.h" />
+    <ClInclude Include="third_party\libpng\pngconf.h" />
+    <ClInclude Include="third_party\libpng\pngpriv.h" />
+    <ClInclude Include="third_party\zlib\crc32.h" />
+    <ClInclude Include="third_party\zlib\deflate.h" />
+    <ClInclude Include="third_party\zlib\gzguts.h" />
+    <ClInclude Include="third_party\zlib\inffast.h" />
+    <ClInclude Include="third_party\zlib\inffixed.h" />
+    <ClInclude Include="third_party\zlib\inflate.h" />
+    <ClInclude Include="third_party\zlib\inftrees.h" />
+    <ClInclude Include="third_party\zlib\trees.h" />
+    <ClInclude Include="third_party\zlib\zconf.h" />
+    <ClInclude Include="third_party\zlib\zlib.h" />
+    <ClInclude Include="third_party\zlib\zutil.h" />
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc" />
@@ -211,6 +225,43 @@
     <ClCompile Include="guetzli\quantize.cc" />
     <ClCompile Include="guetzli\score.cc" />
     <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc" />
+    <ClCompile Include="third_party\libpng\png.c" />
+    <ClCompile Include="third_party\libpng\pngerror.c" />
+    <ClCompile Include="third_party\libpng\pngget.c" />
+    <ClCompile Include="third_party\libpng\pngmem.c" />
+    <ClCompile Include="third_party\libpng\pngpread.c" />
+    <ClCompile Include="third_party\libpng\pngread.c" />
+    <ClCompile Include="third_party\libpng\pngrio.c" />
+    <ClCompile Include="third_party\libpng\pngrtran.c" />
+    <ClCompile Include="third_party\libpng\pngrutil.c" />
+    <ClCompile Include="third_party\libpng\pngset.c" />
+    <ClCompile Include="third_party\libpng\pngtrans.c" />
+    <ClCompile Include="third_party\libpng\pngwio.c" />
+    <ClCompile Include="third_party\libpng\pngwrite.c" />
+    <ClCompile Include="third_party\libpng\pngwtran.c" />
+    <ClCompile Include="third_party\libpng\pngwutil.c" />
+    <ClCompile Include="third_party\zlib\adler32.c" />
+    <ClCompile Include="third_party\zlib\compress.c" />
+    <ClCompile Include="third_party\zlib\crc32.c" />
+    <ClCompile Include="third_party\zlib\deflate.c" />
+    <ClCompile Include="third_party\zlib\gzclose.c" />
+    <ClCompile Include="third_party\zlib\gzlib.c" />
+    <ClCompile Include="third_party\zlib\gzread.c" />
+    <ClCompile Include="third_party\zlib\gzwrite.c" />
+    <ClCompile Include="third_party\zlib\infback.c" />
+    <ClCompile Include="third_party\zlib\inffast.c" />
+    <ClCompile Include="third_party\zlib\inflate.c" />
+    <ClCompile Include="third_party\zlib\inftrees.c" />
+    <ClCompile Include="third_party\zlib\trees.c" />
+    <ClCompile Include="third_party\zlib\uncompr.c" />
+    <ClCompile Include="third_party\zlib\zutil.c" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="third_party\libpng\pngwin.def" />
+    <None Include="third_party\zlib\inffas32.asm" />
+    <None Include="third_party\zlib\match32.asm" />
+    <None Include="third_party\zlib\match686.asm" />
+    <None Include="third_party\zlib\zlib.def" />
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index da2297c5..be2fe5a3 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+﻿<?xml version="1.0" encoding="utf-8"?>
 <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup>
     <Filter Include="guetzli">
@@ -13,6 +13,12 @@
     <Filter Include="third_party\butteraugli\butteraugli">
       <UniqueIdentifier>{FD6FCB41-6929-36EC-F288-50C65E41EC5B}</UniqueIdentifier>
     </Filter>
+    <Filter Include="third_party\libpng">
+      <UniqueIdentifier>{40be58d6-6dfc-45a3-8ca1-7d1b14051ddc}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="third_party\zlib">
+      <UniqueIdentifier>{cb89c1ac-8399-4814-88f2-4b69576bc9f9}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="guetzli\butteraugli_comparator.h">
@@ -93,6 +99,48 @@
     <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h">
       <Filter>third_party\butteraugli\butteraugli</Filter>
     </ClInclude>
+    <ClInclude Include="third_party\libpng\png.h">
+      <Filter>third_party\libpng</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\libpng\pngconf.h">
+      <Filter>third_party\libpng</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\libpng\pngpriv.h">
+      <Filter>third_party\libpng</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\crc32.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\deflate.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\gzguts.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inffast.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inffixed.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inflate.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inftrees.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\trees.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\zconf.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\zlib.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\zutil.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc">
@@ -158,5 +206,112 @@
     <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc">
       <Filter>third_party\butteraugli\butteraugli</Filter>
     </ClCompile>
+    <ClCompile Include="third_party\libpng\png.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngerror.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngget.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngmem.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngpread.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngread.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngrio.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngrtran.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngrutil.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngset.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngtrans.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwio.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwrite.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwtran.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwutil.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\adler32.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\compress.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\crc32.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\deflate.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzclose.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzlib.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzread.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzwrite.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\infback.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\inffast.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\inflate.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\inftrees.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\trees.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\uncompr.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\zutil.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="third_party\libpng\pngwin.def">
+      <Filter>third_party\libpng</Filter>
+    </None>
+    <None Include="third_party\zlib\inffas32.asm">
+      <Filter>third_party\zlib</Filter>
+    </None>
+    <None Include="third_party\zlib\match32.asm">
+      <Filter>third_party\zlib</Filter>
+    </None>
+    <None Include="third_party\zlib\match686.asm">
+      <Filter>third_party\zlib</Filter>
+    </None>
+    <None Include="third_party\zlib\zlib.def">
+      <Filter>third_party\zlib</Filter>
+    </None>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/guetzli.vcxproj.user b/guetzli.vcxproj.user
new file mode 100644
index 00000000..da467b73
--- /dev/null
+++ b/guetzli.vcxproj.user
@@ -0,0 +1,8 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LocalDebuggerCommandArguments>test.jpg out.jpg</LocalDebuggerCommandArguments>
+    <LocalDebuggerWorkingDirectory>$(OutDir)</LocalDebuggerWorkingDirectory>
+    <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
+  </PropertyGroup>
+</Project>
\ No newline at end of file
diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc
index d4e282b8..85cd4bb7 100644
--- a/guetzli/guetzli.cc
+++ b/guetzli/guetzli.cc
@@ -164,7 +164,9 @@ std::string ReadFileOrDie(const char* filename) {
   off_t buffer_size = 8192;
 
   if (fseek(f, 0, SEEK_END) == 0) {
-    buffer_size = std::max<off_t>(ftell(f), 1);
+//    buffer_size = std::max<off_t>(ftell(f), 1);
+	  long size = ftell(f);
+	  buffer_size = size > 0 ? size : 1;
     if (fseek(f, 0, SEEK_SET) != 0) {
       perror("fseek");
       exit(1);
diff --git a/guetzli_static.vcxproj b/guetzli_static.vcxproj
index 02e6b436..5d9dd9cd 100644
--- a/guetzli_static.vcxproj
+++ b/guetzli_static.vcxproj
@@ -110,7 +110,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>Full</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
@@ -140,7 +140,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
     </ClCompile>
@@ -176,6 +176,20 @@
     <ClInclude Include="guetzli\score.h" />
     <ClInclude Include="guetzli\stats.h" />
     <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h" />
+    <ClInclude Include="third_party\libpng\png.h" />
+    <ClInclude Include="third_party\libpng\pngconf.h" />
+    <ClInclude Include="third_party\libpng\pngpriv.h" />
+    <ClInclude Include="third_party\zlib\crc32.h" />
+    <ClInclude Include="third_party\zlib\deflate.h" />
+    <ClInclude Include="third_party\zlib\gzguts.h" />
+    <ClInclude Include="third_party\zlib\inffast.h" />
+    <ClInclude Include="third_party\zlib\inffixed.h" />
+    <ClInclude Include="third_party\zlib\inflate.h" />
+    <ClInclude Include="third_party\zlib\inftrees.h" />
+    <ClInclude Include="third_party\zlib\trees.h" />
+    <ClInclude Include="third_party\zlib\zconf.h" />
+    <ClInclude Include="third_party\zlib\zlib.h" />
+    <ClInclude Include="third_party\zlib\zutil.h" />
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc" />
@@ -198,6 +212,43 @@
     <ClCompile Include="guetzli\quantize.cc" />
     <ClCompile Include="guetzli\score.cc" />
     <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc" />
+    <ClCompile Include="third_party\libpng\png.c" />
+    <ClCompile Include="third_party\libpng\pngerror.c" />
+    <ClCompile Include="third_party\libpng\pngget.c" />
+    <ClCompile Include="third_party\libpng\pngmem.c" />
+    <ClCompile Include="third_party\libpng\pngpread.c" />
+    <ClCompile Include="third_party\libpng\pngread.c" />
+    <ClCompile Include="third_party\libpng\pngrio.c" />
+    <ClCompile Include="third_party\libpng\pngrtran.c" />
+    <ClCompile Include="third_party\libpng\pngrutil.c" />
+    <ClCompile Include="third_party\libpng\pngset.c" />
+    <ClCompile Include="third_party\libpng\pngtrans.c" />
+    <ClCompile Include="third_party\libpng\pngwio.c" />
+    <ClCompile Include="third_party\libpng\pngwrite.c" />
+    <ClCompile Include="third_party\libpng\pngwtran.c" />
+    <ClCompile Include="third_party\libpng\pngwutil.c" />
+    <ClCompile Include="third_party\zlib\adler32.c" />
+    <ClCompile Include="third_party\zlib\compress.c" />
+    <ClCompile Include="third_party\zlib\crc32.c" />
+    <ClCompile Include="third_party\zlib\deflate.c" />
+    <ClCompile Include="third_party\zlib\gzclose.c" />
+    <ClCompile Include="third_party\zlib\gzlib.c" />
+    <ClCompile Include="third_party\zlib\gzread.c" />
+    <ClCompile Include="third_party\zlib\gzwrite.c" />
+    <ClCompile Include="third_party\zlib\infback.c" />
+    <ClCompile Include="third_party\zlib\inffast.c" />
+    <ClCompile Include="third_party\zlib\inflate.c" />
+    <ClCompile Include="third_party\zlib\inftrees.c" />
+    <ClCompile Include="third_party\zlib\trees.c" />
+    <ClCompile Include="third_party\zlib\uncompr.c" />
+    <ClCompile Include="third_party\zlib\zutil.c" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="third_party\libpng\pngwin.def" />
+    <None Include="third_party\zlib\inffas32.asm" />
+    <None Include="third_party\zlib\match32.asm" />
+    <None Include="third_party\zlib\match686.asm" />
+    <None Include="third_party\zlib\zlib.def" />
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
diff --git a/guetzli_static.vcxproj.filters b/guetzli_static.vcxproj.filters
index ec134ccc..9362cd94 100644
--- a/guetzli_static.vcxproj.filters
+++ b/guetzli_static.vcxproj.filters
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+﻿<?xml version="1.0" encoding="utf-8"?>
 <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup>
     <Filter Include="guetzli">
@@ -13,6 +13,12 @@
     <Filter Include="third_party\butteraugli\butteraugli">
       <UniqueIdentifier>{FD6FCB41-6929-36EC-F288-50C65E41EC5B}</UniqueIdentifier>
     </Filter>
+    <Filter Include="third_party\libpng">
+      <UniqueIdentifier>{61f0e3eb-c213-49c5-883a-060bdaf927bb}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="third_party\zlib">
+      <UniqueIdentifier>{ba7b6163-a7d1-4f14-b4b3-3d35f296563a}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="guetzli\butteraugli_comparator.h">
@@ -93,6 +99,48 @@
     <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h">
       <Filter>third_party\butteraugli\butteraugli</Filter>
     </ClInclude>
+    <ClInclude Include="third_party\libpng\png.h">
+      <Filter>third_party\libpng</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\libpng\pngconf.h">
+      <Filter>third_party\libpng</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\libpng\pngpriv.h">
+      <Filter>third_party\libpng</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\crc32.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\deflate.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\gzguts.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inffast.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inffixed.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inflate.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inftrees.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\trees.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\zconf.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\zlib.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\zutil.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc">
@@ -155,5 +203,112 @@
     <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc">
       <Filter>third_party\butteraugli\butteraugli</Filter>
     </ClCompile>
+    <ClCompile Include="third_party\libpng\png.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngerror.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngget.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngmem.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngpread.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngread.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngrio.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngrtran.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngrutil.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngset.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngtrans.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwio.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwrite.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwtran.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwutil.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\adler32.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\compress.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\crc32.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\deflate.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzclose.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzlib.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzread.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzwrite.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\infback.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\inffast.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\inflate.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\inftrees.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\trees.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\uncompr.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\zutil.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="third_party\libpng\pngwin.def">
+      <Filter>third_party\libpng</Filter>
+    </None>
+    <None Include="third_party\zlib\inffas32.asm">
+      <Filter>third_party\zlib</Filter>
+    </None>
+    <None Include="third_party\zlib\match32.asm">
+      <Filter>third_party\zlib</Filter>
+    </None>
+    <None Include="third_party\zlib\match686.asm">
+      <Filter>third_party\zlib</Filter>
+    </None>
+    <None Include="third_party\zlib\zlib.def">
+      <Filter>third_party\zlib</Filter>
+    </None>
   </ItemGroup>
 </Project>
\ No newline at end of file

From 9ff693c27f04484f8a55a94b37d27e422edf0f86 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 25 Apr 2017 12:53:07 +0800
Subject: [PATCH 002/189] remove per-user guetzli.vcxproj.user

---
 guetzli.vcxproj.user | 8 --------
 1 file changed, 8 deletions(-)
 delete mode 100644 guetzli.vcxproj.user

diff --git a/guetzli.vcxproj.user b/guetzli.vcxproj.user
deleted file mode 100644
index da467b73..00000000
--- a/guetzli.vcxproj.user
+++ /dev/null
@@ -1,8 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <LocalDebuggerCommandArguments>test.jpg out.jpg</LocalDebuggerCommandArguments>
-    <LocalDebuggerWorkingDirectory>$(OutDir)</LocalDebuggerWorkingDirectory>
-    <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
-  </PropertyGroup>
-</Project>
\ No newline at end of file

From fb1032b3edbc9b3c5213e80a4084e3f113b4bd61 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 26 Apr 2017 10:55:14 +0800
Subject: [PATCH 003/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

---
 guetzli.vcxproj                               | 151 +++++++---
 guetzli.vcxproj.filters                       | 258 +++++++++++++++++-
 .../butteraugli/butteraugli/butteraugli.cc    |  38 ++-
 3 files changed, 398 insertions(+), 49 deletions(-)

diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index dd49fa15..05a625ec 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -113,22 +113,24 @@
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <ClCompile>
-      <PrecompiledHeader>NotUsing</PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <Optimization>Full</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <MinimalRebuild>false</MinimalRebuild>
-      <StringPooling>true</StringPooling>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <Optimization>Full</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <MinimalRebuild>false</MinimalRebuild>
+      <StringPooling>true</StringPooling>
+      <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
       <AdditionalDependencies>shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
+      <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@@ -147,21 +149,23 @@
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <ClCompile>
-      <PrecompiledHeader>NotUsing</PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
-      <Optimization>Disabled</Optimization>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemGroup>
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
+      <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
     <ClInclude Include="guetzli\butteraugli_comparator.h" />
     <ClInclude Include="guetzli\color_transform.h" />
     <ClInclude Include="guetzli\comparator.h" />
@@ -185,9 +189,58 @@
     <ClInclude Include="guetzli\processor.h" />
     <ClInclude Include="guetzli\quality.h" />
     <ClInclude Include="guetzli\quantize.h" />
-    <ClInclude Include="guetzli\score.h" />
-    <ClInclude Include="guetzli\stats.h" />
-    <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h" />
+    <ClInclude Include="guetzli\score.h" />
+    <ClInclude Include="guetzli\stats.h" />
+    <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\addressmap-inl.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-linuxppc.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-macosx.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\basictypes.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\commandlineflags.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\googleinit.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\logging.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_linux-inl.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_posix-inl.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_win32-inl.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\stl_allocator.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\thread_annotations.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\central_freelist.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\common.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\config_for_unittests.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-checker.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-profiler.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_extension.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_hook.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\profiler.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\stacktrace.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\internal_logging.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\malloc_hook-inl.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\packed-cache-inl.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\pagemap.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap_allocator.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\raw_printer.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\sampler.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\span.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stacktrace_win32-inl.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\static_vars.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\symbolize.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\system-alloc.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\tcmalloc.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\thread_cache.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\config.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler_types.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\port.h" />
     <ClInclude Include="third_party\libpng\png.h" />
     <ClInclude Include="third_party\libpng\pngconf.h" />
     <ClInclude Include="third_party\libpng\pngpriv.h" />
@@ -222,9 +275,41 @@
     <ClCompile Include="guetzli\preprocess_downsample.cc" />
     <ClCompile Include="guetzli\processor.cc" />
     <ClCompile Include="guetzli\quality.cc" />
-    <ClCompile Include="guetzli\quantize.cc" />
-    <ClCompile Include="guetzli\score.cc" />
-    <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc" />
+    <ClCompile Include="guetzli\quantize.cc" />
+    <ClCompile Include="guetzli\score.cc" />
+    <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\dynamic_annotations.c" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\logging.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\central_freelist.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\common.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\fake_stacktrace_scope.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\internal_logging.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_extension.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_hook.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\page_heap.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\raw_printer.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\sampler.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\span.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stacktrace.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\static_vars.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\symbolize.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\thread_cache.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_modrm_map.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_opcode_map.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\patch_functions.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\port.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher_with_stub.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\system-alloc.cc" />
     <ClCompile Include="third_party\libpng\png.c" />
     <ClCompile Include="third_party\libpng\pngerror.c" />
     <ClCompile Include="third_party\libpng\pngget.c" />
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index be2fe5a3..b35df618 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -19,9 +19,12 @@
     <Filter Include="third_party\zlib">
       <UniqueIdentifier>{cb89c1ac-8399-4814-88f2-4b69576bc9f9}</UniqueIdentifier>
     </Filter>
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="guetzli\butteraugli_comparator.h">
+    <Filter Include="third_party\tcmalloc_minimal">
+      <UniqueIdentifier>{f2b475de-6219-478e-9e5e-08f07ef25dbc}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="guetzli\butteraugli_comparator.h">
       <Filter>guetzli</Filter>
     </ClInclude>
     <ClInclude Include="guetzli\color_transform.h">
@@ -141,9 +144,156 @@
     <ClInclude Include="third_party\zlib\zutil.h">
       <Filter>third_party\zlib</Filter>
     </ClInclude>
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="guetzli\butteraugli_comparator.cc">
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\addressmap-inl.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\central_freelist.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\common.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\config_for_unittests.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\internal_logging.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\malloc_hook-inl.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\packed-cache-inl.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap_allocator.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\pagemap.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\raw_printer.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\sampler.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\span.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stacktrace_win32-inl.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\static_vars.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\symbolize.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\system-alloc.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\tcmalloc.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\thread_cache.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-linuxppc.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-macosx.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\basictypes.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\commandlineflags.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\googleinit.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\logging.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_linux-inl.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_posix-inl.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_win32-inl.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\stl_allocator.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\thread_annotations.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\config.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler_types.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\port.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-checker.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_extension.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\profiler.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\stacktrace.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-profiler.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_hook.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="guetzli\butteraugli_comparator.cc">
       <Filter>guetzli</Filter>
     </ClCompile>
     <ClCompile Include="guetzli\dct_double.cc">
@@ -296,6 +446,102 @@
     <ClCompile Include="third_party\zlib\zutil.c">
       <Filter>third_party\zlib</Filter>
     </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\central_freelist.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\common.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\fake_stacktrace_scope.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\internal_logging.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_extension.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_hook.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\page_heap.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\raw_printer.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\sampler.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\span.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stacktrace.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\static_vars.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\symbolize.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\thread_cache.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_modrm_map.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_opcode_map.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\patch_functions.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\port.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher_with_stub.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\system-alloc.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\dynamic_annotations.c">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\logging.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <None Include="third_party\libpng\pngwin.def">
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 7bfae8b1..39af122a 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -1304,6 +1304,9 @@ double MaskDcB(double delta) {
 // square_size square with coordinates
 //   x - offset .. x + square_size - offset - 1,
 //   y - offset .. y + square_size - offset - 1.
+
+// In practice square_size is always 4 and offset is 0, so this could be specialized with SIMD.
+
 void MinSquareVal(size_t square_size, size_t offset,
                   size_t xsize, size_t ysize,
                   float *values) {
@@ -1311,26 +1314,41 @@ void MinSquareVal(size_t square_size, size_t offset,
   // offset is not negative and smaller than square_size.
   assert(offset < square_size);
   std::vector<float> tmp(xsize * ysize);
+
   for (size_t y = 0; y < ysize; ++y) {
     const size_t minh = offset > y ? 0 : y - offset;
     const size_t maxh = std::min<size_t>(ysize, y + square_size - offset);
+
+    float *pTmpPoint = &tmp[y * xsize];
+    float *pValuePoint = &values[minh * xsize];
+
     for (size_t x = 0; x < xsize; ++x) {
-      double min = values[x + minh * xsize];
-      for (size_t j = minh + 1; j < maxh; ++j) {
-        min = fmin(min, values[x + j * xsize]);
-      }
-      tmp[x + y * xsize] = static_cast<float>(min);
+        float *pValues = pValuePoint++;
+        float min = *pValues;
+
+        for (size_t j = minh + 1; j < maxh; ++j) {
+            pValues += xsize;
+            if (*pValues < min) min = *pValues;
+        }
+        *pTmpPoint++ = min;
     }
   }
   for (size_t x = 0; x < xsize; ++x) {
     const size_t minw = offset > x ? 0 : x - offset;
     const size_t maxw = std::min<size_t>(xsize, x + square_size - offset);
+
+    float *pValuePoint = &values[x];
+    float *pTmpPoint = &tmp[minw];
+
     for (size_t y = 0; y < ysize; ++y) {
-      double min = tmp[minw + y * xsize];
-      for (size_t j = minw + 1; j < maxw; ++j) {
-        min = fmin(min, tmp[j + y * xsize]);
-      }
-      values[x + y * xsize] = static_cast<float>(min);
+        float * pTmp = pTmpPoint; pTmpPoint += xsize;
+        float min = *pTmp;
+
+        for (size_t j = minw + 1; j < maxw; ++j) {
+            pTmp++;
+            if (*pTmp < min) min = *pTmp;
+        }
+        *pValuePoint = min; pValuePoint += xsize;
     }
   }
 }

From e89cdcf362529b42c228f4ae4239151d5fa86a72 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 26 Apr 2017 17:40:31 +0800
Subject: [PATCH 004/189] float is enough

---
 third_party/butteraugli/butteraugli/butteraugli.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 39af122a..8871bcdb 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -64,25 +64,25 @@ static void Convolution(size_t xsize, size_t ysize,
                         size_t len, size_t offset,
                         const float* __restrict__ multipliers,
                         const float* __restrict__ inp,
-                        double border_ratio,
+                        float border_ratio,
                         float* __restrict__ result) {
   PROFILER_FUNC;
-  double weight_no_border = 0;
+  float weight_no_border = 0;
   for (size_t j = 0; j <= 2 * offset; ++j) {
     weight_no_border += multipliers[j];
   }
   for (size_t x = 0, ox = 0; x < xsize; x += xstep, ox++) {
     int minx = x < offset ? 0 : x - offset;
     int maxx = std::min(xsize, x + len - offset) - 1;
-    double weight = 0.0;
+    float weight = 0.0;
     for (int j = minx; j <= maxx; ++j) {
       weight += multipliers[j - x + offset];
     }
     // Interpolate linearly between the no-border scaling and border scaling.
     weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
-    double scale = 1.0 / weight;
+    float scale = 1.0 / weight;
     for (size_t y = 0; y < ysize; ++y) {
-      double sum = 0.0;
+      float sum = 0.0;
       for (int j = minx; j <= maxx; ++j) {
         sum += inp[y * xsize + j] * multipliers[j - x + offset];
       }
@@ -739,6 +739,7 @@ const double *GetOpsinAbsorbance() {
   return &kMix[0];
 }
 
+// mix is treated as a [4x4] matrix that is multiplied with the vector in[,,,1] (translated from a mis-encoded comment).
 void OpsinAbsorbance(const double in[3], double out[3]) {
   const double *mix = GetOpsinAbsorbance();
   out[0] = mix[0] * in[0] + mix[1] * in[1] + mix[2] * in[2] + mix[3];

From fe645a92b4e2f393a03a64914dae86718ac3af4e Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 27 Apr 2017 17:23:23 +0800
Subject: [PATCH 005/189] Add OpenCL Support

---
 clguetzli/clguetzli.cl                        |  25 +
 clguetzli/clguetzli.cpp                       |  76 +++
 clguetzli/clguetzli.h                         |   5 +
 clguetzli/ocl.cpp                             | 470 ++++++++++++++++++
 clguetzli/ocl.h                               |  64 +++
 clguetzli/utils.cpp                           |  96 ++++
 clguetzli/utils.h                             |  36 ++
 guetzli.vcxproj                               |  33 +-
 guetzli.vcxproj.filters                       |  26 +-
 guetzli_static.vcxproj                        |   4 +-
 .../butteraugli/butteraugli/butteraugli.cc    |   6 +
 11 files changed, 826 insertions(+), 15 deletions(-)
 create mode 100644 clguetzli/clguetzli.cl
 create mode 100644 clguetzli/clguetzli.cpp
 create mode 100644 clguetzli/clguetzli.h
 create mode 100644 clguetzli/ocl.cpp
 create mode 100644 clguetzli/ocl.h
 create mode 100644 clguetzli/utils.cpp
 create mode 100644 clguetzli/utils.h

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
new file mode 100644
index 00000000..d71249b3
--- /dev/null
+++ b/clguetzli/clguetzli.cl
@@ -0,0 +1,42 @@
+// Sliding-window minimum over a 2-D float image.
+//
+// One work-item per pixel: the global work size is (width, height), and
+// work-item (x, y) writes to pC the smallest pA value inside the window
+//   x - offset .. x + square_size - offset - 1,
+//   y - offset .. y + square_size - offset - 1,
+// clipped to the image bounds.
+//
+// pA          : input image, row-major, width * height floats (only read here)
+// pC          : output image, same layout; it should not alias pA because
+//               other work-items read their windows from pA
+// square_size : window edge length in pixels
+// offset      : how far the window extends to the left of / above (x, y)
+__kernel void MinSquareVal(__global float* pA, __global float* pC, int square_size, int offset)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	const int width = get_global_size(0);
+	const int height = get_global_size(1);
+
+	// Clamp the window to the image. The upper bounds are exclusive and must
+	// never exceed width/height, otherwise the loops below read past the buffer.
+	const int minH = offset > y ? 0 : y - offset;
+	const int maxH = min(y + square_size - offset, height);
+
+	const int minW = offset > x ? 0 : x - offset;
+	const int maxW = min(x + square_size - offset, width);
+
+	// Seed with the window's top-left pixel, which lies inside the image for
+	// any non-negative offset.
+	float minValue = pA[minH * width + minW];
+
+	for (int j = minH; j < maxH; j++)
+	{
+		for (int i = minW; i < maxW; i++)
+		{
+			const float tmp = pA[j * width + i];
+			if (tmp < minValue) minValue = tmp;
+		}
+	}
+	pC[y * width + x] = minValue;
+}
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
new file mode 100644
index 00000000..377a468b
--- /dev/null
+++ b/clguetzli/clguetzli.cpp
@@ -0,0 +1,157 @@
+#include "clguetzli.h"
+#include "ocl.h"
+
+// GPU counterpart of butteraugli's MinSquareVal.
+//
+// Replaces each element of the row-major xsize * ysize image `values` with the
+// minimum over the square_size x square_size window positioned by `offset`,
+// using the "MinSquareVal" kernel from clguetzli\clguetzli.cl.
+//
+// The OpenCL setup, program build and kernel creation are repeated on every
+// call. Failures are reported through LogError; if the result cannot be
+// produced the function returns early and `values` is left unmodified.
+void clMinSquareVal(size_t square_size, size_t offset,
+	size_t xsize, size_t ysize,
+	float *values)
+{
+	const size_t pixelBytes = sizeof(cl_float) * xsize * ysize;
+	if (values == nullptr || pixelBytes == 0)
+	{
+		return;	// empty image: nothing to do
+	}
+
+	cl_int err = CL_SUCCESS;
+
+	ocl_args_d_t ocl;
+	SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU);
+	if (ocl.context == NULL || ocl.commandQueue == NULL)
+	{
+		LogError("Error: SetupOpenCL did not produce a usable GPU context.\n");
+		return;
+	}
+
+	// Host staging memory that backs the CL_MEM_USE_HOST_PTR buffers below.
+	// size_t (not cl_uint) keeps the byte count from wrapping on large images.
+	const size_t optimizedSize = ((pixelBytes - 1) / 64 + 1) * 64;
+	cl_float* inputA = (cl_float*)_aligned_malloc(optimizedSize, 4096);
+	cl_float* outputC = (cl_float*)_aligned_malloc(optimizedSize, 4096);
+
+	// Single-pass block: any failure breaks out to the shared cleanup below.
+	do
+	{
+		if (inputA == nullptr || outputC == nullptr)
+		{
+			LogError("Error: _aligned_malloc failed for the staging buffers.\n");
+			break;
+		}
+		memcpy(inputA, values, pixelBytes);
+
+		ocl.srcA = clCreateBuffer(ocl.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, pixelBytes, inputA, &err);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clCreateBuffer for srcA returned %s.\n", TranslateOpenCLError(err));
+			break;
+		}
+		ocl.dstMem = clCreateBuffer(ocl.context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, pixelBytes, outputC, &err);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clCreateBuffer for dstMem returned %s.\n", TranslateOpenCLError(err));
+			break;
+		}
+
+		char* source = nullptr;
+		size_t src_size = 0;
+		ReadSourceFromFile("clguetzli\\clguetzli.cl", &source, &src_size);
+		if (source == nullptr)
+		{
+			LogError("Error: could not read kernel source clguetzli\\clguetzli.cl.\n");
+			break;
+		}
+
+		ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err);
+		delete[] source;	// no longer needed once the program object exists
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clCreateProgramWithSource returned %s.\n", TranslateOpenCLError(err));
+			break;
+		}
+
+		err = clBuildProgram(ocl.program, 1, &ocl.device, "", NULL, NULL);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
+			break;
+		}
+		ocl.kernel = clCreateKernel(ocl.program, "MinSquareVal", &err);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clCreateKernel returned %s.\n", TranslateOpenCLError(err));
+			break;
+		}
+
+		// Argument order must match the kernel signature:
+		//   MinSquareVal(pA, pC, square_size, offset)
+		cl_int clsquare_size = (cl_int)square_size;
+		cl_int cloffset = (cl_int)offset;
+		err = clSetKernelArg(ocl.kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA);
+		if (CL_SUCCESS == err) err = clSetKernelArg(ocl.kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem);
+		if (CL_SUCCESS == err) err = clSetKernelArg(ocl.kernel, 2, sizeof(cl_int), (void*)&clsquare_size);
+		if (CL_SUCCESS == err) err = clSetKernelArg(ocl.kernel, 3, sizeof(cl_int), (void*)&cloffset);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clSetKernelArg returned %s.\n", TranslateOpenCLError(err));
+			break;
+		}
+
+		// One work-item per pixel.
+		size_t globalWorkSize[2] = { xsize, ysize };
+		err = clEnqueueNDRangeKernel(ocl.commandQueue, ocl.kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+			break;
+		}
+		err = clFinish(ocl.commandQueue);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clFinish returned %s\n", TranslateOpenCLError(err));
+			break;
+		}
+
+		// Blocking map, so the result is readable as soon as the call returns.
+		cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, CL_TRUE, CL_MAP_READ, 0, pixelBytes, 0, NULL, NULL, &err);
+		if (CL_SUCCESS != err || resultPtr == nullptr)
+		{
+			LogError("Error: clEnqueueMapBuffer returned %s\n", TranslateOpenCLError(err));
+			break;
+		}
+		memcpy(values, resultPtr, pixelBytes);
+
+		// Every successful map needs a matching unmap before the buffer is released.
+		err = clEnqueueUnmapMemObject(ocl.commandQueue, ocl.dstMem, resultPtr, 0, NULL, NULL);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clEnqueueUnmapMemObject returned %s\n", TranslateOpenCLError(err));
+		}
+		err = clFinish(ocl.commandQueue);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clFinish returned %s\n", TranslateOpenCLError(err));
+		}
+	} while (false);
+
+	// The buffers borrow inputA/outputC, so release them before that memory is
+	// freed, and clear the handles so ~ocl_args_d_t() does not release them again.
+	if (ocl.srcA)
+	{
+		clReleaseMemObject(ocl.srcA);
+		ocl.srcA = NULL;
+	}
+	if (ocl.dstMem)
+	{
+		clReleaseMemObject(ocl.dstMem);
+		ocl.dstMem = NULL;
+	}
+	_aligned_free(inputA);
+	_aligned_free(outputC);
+}
\ No newline at end of file
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
new file mode 100644
index 00000000..a6cf8242
--- /dev/null
+++ b/clguetzli/clguetzli.h
@@ -0,0 +1,5 @@
+#pragma once
+
+void clMinSquareVal(size_t square_size, size_t offset,
+	size_t xsize, size_t ysize,
+	float *values);
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
new file mode 100644
index 00000000..077d3464
--- /dev/null
+++ b/clguetzli/ocl.cpp
@@ -0,0 +1,470 @@
+#include "ocl.h"
+#include <vector>
+
+
+ocl_args_d_t::ocl_args_d_t() :	// start out holding no OpenCL objects
+	context(NULL),
+	device(NULL),
+	commandQueue(NULL),
+	program(NULL),
+	kernel(NULL),
+	platformVersion(OPENCL_VERSION_1_2),	// the three versions default to 1.2
+	deviceVersion(OPENCL_VERSION_1_2),
+	compilerVersion(OPENCL_VERSION_1_2),
+	srcA(NULL),	// memory objects are NULL until some other code stores buffers here
+	srcB(NULL),
+	dstMem(NULL)
+{
+}
+
+/*
+* Destructor - releases every OpenCL object this structure still holds.
+* Each non-NULL handle is released in turn: kernel, program, the three memory
+* objects, command queue, device and finally the context.
+*
+* A failing clRelease* call is only reported through LogError(); the remaining
+* handles are still released.
+* The handles are not reset to NULL afterwards, so this sequence must run
+* only once per object.
+*
+*/
+ocl_args_d_t::~ocl_args_d_t()
+{
+	cl_int err = CL_SUCCESS;
+
+	if (kernel)
+	{
+		err = clReleaseKernel(kernel);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(err));
+		}
+	}
+	if (program)
+	{
+		err = clReleaseProgram(program);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clReleaseProgram returned '%s'.\n", TranslateOpenCLError(err));
+		}
+	}
+	if (srcA)
+	{
+		err = clReleaseMemObject(srcA);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err));
+		}
+	}
+	if (srcB)
+	{
+		err = clReleaseMemObject(srcB);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err));
+		}
+	}
+	if (dstMem)
+	{
+		err = clReleaseMemObject(dstMem);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err));
+		}
+	}
+	if (commandQueue)
+	{
+		err = clReleaseCommandQueue(commandQueue);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clReleaseCommandQueue returned '%s'.\n", TranslateOpenCLError(err));
+		}
+	}
+	if (device)	// NOTE(review): SetupOpenCL() obtains this handle via clGetContextInfo, not clRetainDevice - confirm the release is intended
+	{
+		err = clReleaseDevice(device);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clReleaseDevice returned '%s'.\n", TranslateOpenCLError(err));
+		}
+	}
+	if (context)
+	{
+		err = clReleaseContext(context);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clReleaseContext returned '%s'.\n", TranslateOpenCLError(err));
+		}
+	}
+
+	/*
+	* Note that there is no call to release the platform:
+	* the platform is not an object created at startup,
+	* it is only queried from the OpenCL runtime.
+	*/
+}
+
+const char* TranslateOpenCLError(cl_int errorCode)
+{
+	switch (errorCode)	// map each known status code to the name of its constant
+	{
+	case CL_SUCCESS:                            return "CL_SUCCESS";
+	case CL_DEVICE_NOT_FOUND:                   return "CL_DEVICE_NOT_FOUND";
+	case CL_DEVICE_NOT_AVAILABLE:               return "CL_DEVICE_NOT_AVAILABLE";
+	case CL_COMPILER_NOT_AVAILABLE:             return "CL_COMPILER_NOT_AVAILABLE";
+	case CL_MEM_OBJECT_ALLOCATION_FAILURE:      return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+	case CL_OUT_OF_RESOURCES:                   return "CL_OUT_OF_RESOURCES";
+	case CL_OUT_OF_HOST_MEMORY:                 return "CL_OUT_OF_HOST_MEMORY";
+	case CL_PROFILING_INFO_NOT_AVAILABLE:       return "CL_PROFILING_INFO_NOT_AVAILABLE";
+	case CL_MEM_COPY_OVERLAP:                   return "CL_MEM_COPY_OVERLAP";
+	case CL_IMAGE_FORMAT_MISMATCH:              return "CL_IMAGE_FORMAT_MISMATCH";
+	case CL_IMAGE_FORMAT_NOT_SUPPORTED:         return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
+	case CL_BUILD_PROGRAM_FAILURE:              return "CL_BUILD_PROGRAM_FAILURE";
+	case CL_MAP_FAILURE:                        return "CL_MAP_FAILURE";
+	case CL_MISALIGNED_SUB_BUFFER_OFFSET:       return "CL_MISALIGNED_SUB_BUFFER_OFFSET";                          //-13
+	case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:    return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";   //-14
+	case CL_COMPILE_PROGRAM_FAILURE:            return "CL_COMPILE_PROGRAM_FAILURE";                               //-15
+	case CL_LINKER_NOT_AVAILABLE:               return "CL_LINKER_NOT_AVAILABLE";                                  //-16
+	case CL_LINK_PROGRAM_FAILURE:               return "CL_LINK_PROGRAM_FAILURE";                                  //-17
+	case CL_DEVICE_PARTITION_FAILED:            return "CL_DEVICE_PARTITION_FAILED";                               //-18
+	case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:      return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";                         //-19
+	case CL_INVALID_VALUE:                      return "CL_INVALID_VALUE";
+	case CL_INVALID_DEVICE_TYPE:                return "CL_INVALID_DEVICE_TYPE";
+	case CL_INVALID_PLATFORM:                   return "CL_INVALID_PLATFORM";
+	case CL_INVALID_DEVICE:                     return "CL_INVALID_DEVICE";
+	case CL_INVALID_CONTEXT:                    return "CL_INVALID_CONTEXT";
+	case CL_INVALID_QUEUE_PROPERTIES:           return "CL_INVALID_QUEUE_PROPERTIES";
+	case CL_INVALID_COMMAND_QUEUE:              return "CL_INVALID_COMMAND_QUEUE";
+	case CL_INVALID_HOST_PTR:                   return "CL_INVALID_HOST_PTR";
+	case CL_INVALID_MEM_OBJECT:                 return "CL_INVALID_MEM_OBJECT";
+	case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:    return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+	case CL_INVALID_IMAGE_SIZE:                 return "CL_INVALID_IMAGE_SIZE";
+	case CL_INVALID_SAMPLER:                    return "CL_INVALID_SAMPLER";
+	case CL_INVALID_BINARY:                     return "CL_INVALID_BINARY";
+	case CL_INVALID_BUILD_OPTIONS:              return "CL_INVALID_BUILD_OPTIONS";
+	case CL_INVALID_PROGRAM:                    return "CL_INVALID_PROGRAM";
+	case CL_INVALID_PROGRAM_EXECUTABLE:         return "CL_INVALID_PROGRAM_EXECUTABLE";
+	case CL_INVALID_KERNEL_NAME:                return "CL_INVALID_KERNEL_NAME";
+	case CL_INVALID_KERNEL_DEFINITION:          return "CL_INVALID_KERNEL_DEFINITION";
+	case CL_INVALID_KERNEL:                     return "CL_INVALID_KERNEL";
+	case CL_INVALID_ARG_INDEX:                  return "CL_INVALID_ARG_INDEX";
+	case CL_INVALID_ARG_VALUE:                  return "CL_INVALID_ARG_VALUE";
+	case CL_INVALID_ARG_SIZE:                   return "CL_INVALID_ARG_SIZE";
+	case CL_INVALID_KERNEL_ARGS:                return "CL_INVALID_KERNEL_ARGS";
+	case CL_INVALID_WORK_DIMENSION:             return "CL_INVALID_WORK_DIMENSION";
+	case CL_INVALID_WORK_GROUP_SIZE:            return "CL_INVALID_WORK_GROUP_SIZE";
+	case CL_INVALID_WORK_ITEM_SIZE:             return "CL_INVALID_WORK_ITEM_SIZE";
+	case CL_INVALID_GLOBAL_OFFSET:              return "CL_INVALID_GLOBAL_OFFSET";
+	case CL_INVALID_EVENT_WAIT_LIST:            return "CL_INVALID_EVENT_WAIT_LIST";
+	case CL_INVALID_EVENT:                      return "CL_INVALID_EVENT";
+	case CL_INVALID_OPERATION:                  return "CL_INVALID_OPERATION";
+	case CL_INVALID_GL_OBJECT:                  return "CL_INVALID_GL_OBJECT";
+	case CL_INVALID_BUFFER_SIZE:                return "CL_INVALID_BUFFER_SIZE";
+	case CL_INVALID_MIP_LEVEL:                  return "CL_INVALID_MIP_LEVEL";
+	case CL_INVALID_GLOBAL_WORK_SIZE:           return "CL_INVALID_GLOBAL_WORK_SIZE";                           //-63
+	case CL_INVALID_PROPERTY:                   return "CL_INVALID_PROPERTY";                                   //-64
+	case CL_INVALID_IMAGE_DESCRIPTOR:           return "CL_INVALID_IMAGE_DESCRIPTOR";                           //-65
+	case CL_INVALID_COMPILER_OPTIONS:           return "CL_INVALID_COMPILER_OPTIONS";                           //-66
+	case CL_INVALID_LINKER_OPTIONS:             return "CL_INVALID_LINKER_OPTIONS";                             //-67
+	case CL_INVALID_DEVICE_PARTITION_COUNT:     return "CL_INVALID_DEVICE_PARTITION_COUNT";                     //-68
+	// Disabled: case CL_INVALID_PIPE_SIZE:     return "CL_INVALID_PIPE_SIZE";                                  //-69
+	// Disabled: case CL_INVALID_DEVICE_QUEUE:  return "CL_INVALID_DEVICE_QUEUE";                               //-70
+
+	default:	// any code without a case above, including the two disabled ones
+		return "UNKNOWN ERROR CODE";
+	}
+}
+
+
+/*
+* Check whether an OpenCL platform is the required platform:
+* true when preferredPlatform is a substring of the platform's CL_PLATFORM_NAME.
+* Returns false when preferredPlatform is NULL or the name cannot be queried.
+*/
+bool CheckPreferredPlatformMatch(cl_platform_id platform, const char* preferredPlatform)
+{
+	size_t stringLength = 0;
+	cl_int err = CL_SUCCESS;
+
+	// strstr() must not be handed a NULL needle.
+	if (NULL == preferredPlatform)
+	{
+		return false;
+	}
+
+	// In order to read the platform's name, first query the length of the name string
+	// (param_value is NULL); the length is returned in stringLength.
+	err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, NULL, &stringLength);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_NAME length returned '%s'.\n", TranslateOpenCLError(err));
+		return false;
+	}
+
+	// Allocate one extra zeroed element so the buffer is never empty and is always
+	// NUL-terminated, whatever length the runtime reported.
+	std::vector<char> platformName(stringLength + 1, '\0');
+
+	// Read the platform's name string into platformName.
+	err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, stringLength, &platformName[0], NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_NAME returned %s.\n", TranslateOpenCLError(err));
+		return false;
+	}
+
+	// The platform is the required one when its name contains preferredPlatform.
+	return strstr(&platformName[0], preferredPlatform) != NULL;
+}
+
+/*
+* Return the first platform that matches preferredPlatform (any platform when it is NULL or empty)
+* and reports at least one device of deviceType; returns NULL when there is none or a platform query fails.
+*/
+cl_platform_id FindOpenCLPlatform(const char* preferredPlatform, cl_device_type deviceType)
+{
+	cl_uint numPlatforms = 0;
+	cl_int err = CL_SUCCESS;
+
+	// Get (in numPlatforms) the number of OpenCL platforms available
+	// No platform ID will be returned, since platforms is NULL
+	err = clGetPlatformIDs(0, NULL, &numPlatforms);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clGetplatform_ids() to get num platforms returned %s.\n", TranslateOpenCLError(err));
+		return NULL;
+	}
+	LogInfo("Number of available platforms: %u\n", numPlatforms);
+
+	if (0 == numPlatforms)
+	{
+		LogError("Error: No platforms found!\n");
+		return NULL;
+	}
+
+	std::vector<cl_platform_id> platforms(numPlatforms);
+
+	// Now obtain the list of the numPlatforms OpenCL platforms available
+	// The list of platforms available will be returned in platforms
+	err = clGetPlatformIDs(numPlatforms, &platforms[0], NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clGetplatform_ids() to get platforms returned %s.\n", TranslateOpenCLError(err));
+		return NULL;
+	}
+
+	// Check if one of the available platforms matches the preferred requirements
+	for (cl_uint i = 0; i < numPlatforms; i++)
+	{
+		bool match = true;
+		cl_uint numDevices = 0;
+
+		// If the preferredPlatform is not NULL or empty then check if platforms[i] is the required one
+		// Otherwise, continue the check with platforms[i]
+		if ((NULL != preferredPlatform) && (strlen(preferredPlatform) > 0))
+		{
+			// In case we're looking for a specific platform
+			match = CheckPreferredPlatformMatch(platforms[i], preferredPlatform);
+		}
+
+		// match is true if the platform's name is the required one or don't care (NULL or empty)
+		if (match)
+		{
+			// Obtain the number of deviceType devices available on the platform.
+			// numDevices was initialized to zero and is expected to stay zero when the call fails.
+			// A failure is only logged, not treated as fatal, since a non-zero error code
+			// could happen if this platform doesn't support the specified device type.
+			err = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &numDevices);
+			if (CL_SUCCESS != err)
+			{
+				LogError("clGetDeviceIDs() returned %s.\n", TranslateOpenCLError(err));
+			}
+
+			if (0 != numDevices)
+			{
+				// There is at least one device that meets the requirements
+				return platforms[i];
+			}
+		}
+	}
+
+	return NULL;
+}
+
+
+/*
+* This function reads the OpenCL platform and device versions (using the clGetxxxInfo API)
+* and stores them in the ocl structure, so that both OpenCL 1.2 and 2.0 platforms and devices
+* can be supported in the same program. A field is only changed when its version string contains "OpenCL 2.0" ("OpenCL C 2.0" for the compiler).
+* Returns CL_SUCCESS, or the error code of the first query that failed.
+*/
+int GetPlatformAndDeviceVersion(cl_platform_id platformId, ocl_args_d_t *ocl)
+{
+	cl_int err = CL_SUCCESS;
+
+	// Read the platform's version string length (param_value is NULL).
+	// The value returned in stringLength
+	size_t stringLength = 0;
+	err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, 0, NULL, &stringLength);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_VERSION length returned '%s'.\n", TranslateOpenCLError(err));
+		return err;
+	}
+
+	// Now, that we know the platform's version string length, we can allocate enough space before read it
+	std::vector<char> platformVersion(stringLength);
+
+	// Read the platform's version string
+	// The read value returned in platformVersion
+	err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, stringLength, &platformVersion[0], NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_VERSION returned %s.\n", TranslateOpenCLError(err));
+		return err;
+	}
+
+	if (strstr(&platformVersion[0], "OpenCL 2.0") != NULL)
+	{
+		ocl->platformVersion = OPENCL_VERSION_2_0;
+	}
+
+	// Read the device's version string length (param_value is NULL).
+	err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, 0, NULL, &stringLength);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION length returned '%s'.\n", TranslateOpenCLError(err));
+		return err;
+	}
+
+	// Now, that we know the device's version string length, we can allocate enough space before read it
+	std::vector<char> deviceVersion(stringLength);
+
+	// Read the device's version string
+	// The read value returned in deviceVersion
+	err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, stringLength, &deviceVersion[0], NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION returned %s.\n", TranslateOpenCLError(err));
+		return err;
+	}
+
+	if (strstr(&deviceVersion[0], "OpenCL 2.0") != NULL)
+	{
+		ocl->deviceVersion = OPENCL_VERSION_2_0;
+	}
+
+	// Read the device's OpenCL C version string length (param_value is NULL).
+	err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &stringLength);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION length returned '%s'.\n", TranslateOpenCLError(err));
+		return err;
+	}
+
+	// Now, that we know the device's OpenCL C version string length, we can allocate enough space before read it
+	std::vector<char> compilerVersion(stringLength);
+
+	// Read the device's OpenCL C version string
+	// The read value returned in compilerVersion
+	err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, stringLength, &compilerVersion[0], NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION returned %s.\n", TranslateOpenCLError(err));
+		return err;
+	}
+
+	if (strstr(&compilerVersion[0], "OpenCL C 2.0") != NULL)
+	{
+		ocl->compilerVersion = OPENCL_VERSION_2_0;
+	}
+
+	return err;
+}
+
+
+/*
+* This function picks/creates the OpenCL objects which are needed:
+* OpenCL platform, device, context, and command queue.
+*
+* These steps need to be performed once in a regular OpenCL application,
+* before any compute kernel calls are made.
+*
+* The objects are stored in the fields of the ocl_args_d_t structure passed as parameter ocl
+* (context, device, commandQueue and the three version fields).
+*
+* NOTE: the incoming deviceType value is overwritten below, so the caller's choice is not used.
+* Returns CL_SUCCESS, CL_INVALID_VALUE when no platform is found, or the failing call's error code.
+*/
+int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType)
+{
+	// The following variable stores return codes for all OpenCL calls.
+	cl_int err = CL_SUCCESS;
+
+	// First look for a platform whose name contains "Intel" and that has a GPU device; if none is found,
+	// fall back to any platform with a CPU device. The caller's deviceType is overwritten here.
+	deviceType = CL_DEVICE_TYPE_GPU;
+	cl_platform_id platformId = FindOpenCLPlatform("Intel", deviceType);
+	if (NULL == platformId)
+	{
+		deviceType = CL_DEVICE_TYPE_CPU;
+		platformId = FindOpenCLPlatform("", deviceType);
+	}
+
+	if (NULL == platformId)
+	{
+		LogError("Error: Failed to find OpenCL platform.\n");
+		return CL_INVALID_VALUE;
+	}
+
+	// Create a context on the selected platform for devices of the type chosen above
+	// (GPU, or CPU after the fallback).
+	// No error callback is registered: pfn_notify and user_data are both NULL,
+	// so failures are reported only through err.
+	cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformId, 0 };
+	ocl->context = clCreateContextFromType(contextProperties, deviceType, NULL, NULL, &err);
+	if ((CL_SUCCESS != err) || (NULL == ocl->context))
+	{
+		LogError("Couldn't create a context, clCreateContextFromType() returned '%s'.\n", TranslateOpenCLError(err));
+		return err;
+	}
+
+	// Query the context's device; the buffer holds a single cl_device_id - NOTE(review): confirm the behavior when the context contains several devices
+	err = clGetContextInfo(ocl->context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &ocl->device, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clGetContextInfo() to get list of devices returned %s.\n", TranslateOpenCLError(err));
+		return err;
+	}
+
+	// Read the OpenCL platform's version and the device OpenCL and OpenCL C versions; the return value is ignored here (errors are only logged inside)
+	GetPlatformAndDeviceVersion(platformId, ocl);
+
+	// Create command queue.
+	// OpenCL kernels are enqueued for execution to a particular device through special objects called command queues.
+	// Command queue guarantees some ordering between calls and other OpenCL commands.
+	// Here you create a simple in-order OpenCL command queue (with profiling enabled) that doesn't allow execution of two kernels in parallel on a target device.
+#ifdef CL_VERSION_2_0
+	if (OPENCL_VERSION_2_0 == ocl->deviceVersion)
+	{
+		const cl_command_queue_properties properties[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 };
+		ocl->commandQueue = clCreateCommandQueueWithProperties(ocl->context, ocl->device, properties, &err);
+	}
+	else {
+		// default behavior: OpenCL 1.2
+		cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
+		ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err);
+	}
+#else
+	// default behavior: OpenCL 1.2
+	cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
+	ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err);
+#endif
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clCreateCommandQueue() returned %s.\n", TranslateOpenCLError(err));
+		return err;
+	}
+
+	return CL_SUCCESS;
+}
\ No newline at end of file
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
new file mode 100644
index 00000000..2e2cf02c
--- /dev/null
+++ b/clguetzli/ocl.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include "CL\cl.h"
+#include "utils.h"
+
+// Macros for OpenCL versions
+#define OPENCL_VERSION_1_2  1.2f
+#define OPENCL_VERSION_2_0  2.0f
+
+struct ocl_args_d_t;
+
+/* This function helps to create informative messages in
+* case when OpenCL errors occur. It returns a string
+* representation for an OpenCL error code.
+* (E.g. "CL_DEVICE_NOT_FOUND" instead of just -1.)
+*/
+const char* TranslateOpenCLError(cl_int errorCode);
+
+/*
+* This function picks/creates necessary OpenCL objects which are needed.
+* The objects are:
+* OpenCL platform, device, context, and command queue.
+*
+* All these steps are needed to be performed once in a regular OpenCL application.
+* This happens before actual compute kernels calls are performed.
+*
+* For convenience, in this application you store all those basic OpenCL objects in structure ocl_args_d_t,
+* so this function populates fields of this structure, which is passed as parameter ocl.
+* Please, consider reviewing the fields before going further.
+* The structure is defined at the end of this header.
+*/
+int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType);
+
+
+/* Convenient container for all OpenCL specific objects used in the sample
+*
+* It consists of two parts:
+*   - regular OpenCL objects which are used in almost every normal OpenCL application
+*   - several OpenCL objects that are specific to this particular sample
+*
+* All these objects are collected in one structure for utility purposes
+* only; nothing here is OpenCL specific: it just avoids global variables
+* and makes passing all these arguments to functions easier.
+*/
+struct ocl_args_d_t
+{
+	ocl_args_d_t();
+	~ocl_args_d_t();
+
+	// Regular OpenCL objects:
+	cl_context       context;           // holds the context handle
+	cl_device_id     device;            // holds the selected device handle
+	cl_command_queue commandQueue;      // holds the command-queue handle
+	cl_program       program;           // holds the program handle
+	cl_kernel        kernel;            // holds the kernel handle
+	float            platformVersion;   // holds the OpenCL platform version (default 1.2)
+	float            deviceVersion;     // holds the OpenCL device version (default 1.2)
+	float            compilerVersion;   // holds the device OpenCL C version (default 1.2)
+
+	// Objects that are specific to the algorithm implemented in this sample
+	cl_mem           srcA;              // holds the first source buffer
+	cl_mem           srcB;              // holds the second source buffer
+	cl_mem           dstMem;            // holds the destination buffer
+};
diff --git a/clguetzli/utils.cpp b/clguetzli/utils.cpp
new file mode 100644
index 00000000..24520cd8
--- /dev/null
+++ b/clguetzli/utils.cpp
@@ -0,0 +1,96 @@
+/*****************************************************************************
+ * Copyright (c) 2013-2016 Intel Corporation
+ * All rights reserved.
+ *
+ * WARRANTY DISCLAIMER
+ *
+ * THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
+ * MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel Corporation is the author of the Materials, and requests that all
+ * problem reports or change requests be submitted to it directly
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <tchar.h>
+#include <memory.h>
+#include <windows.h>
+#include "CL\cl.h"
+#include "CL\cl_ext.h"
+#include "utils.h"
+#include <assert.h>
+
+
+//we want to use POSIX functions
+#pragma warning( push )
+#pragma warning( disable : 4996 )
+
+
+void LogInfo(const char* str, ...)
+{
+    // A NULL format string means there is nothing to print.
+    if (!str)
+    {
+        return;
+    }
+    va_list argList;
+    va_start(argList, str);
+    vfprintf(stdout, str, argList);   // printf-style output to stdout
+    va_end(argList);
+}
+
+void LogError(const char* str, ...)
+{
+    // A NULL format string means there is nothing to report.
+    if (!str)
+    {
+        return;
+    }
+    va_list argList;
+    va_start(argList, str);
+    vfprintf(stderr, str, argList);   // printf-style output to stderr
+    va_end(argList);
+}
+
+// Read the OpenCL C source code from fileName into the output argument source.
+// On success *source is a new[]-allocated buffer of *sourceSize bytes (not NUL-terminated)
+// that the caller must delete[]; on failure neither output is written and an error is logged.
+int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize)
+{
+    FILE* fp = NULL;
+    fopen_s(&fp, fileName, "rb");
+    if (fp == NULL)
+    {
+        LogError("Error: Couldn't find program source file '%s'.\n", fileName);
+        return CL_INVALID_VALUE;
+    }
+
+    // Measure the file; ftell() reports failure as -1, which must not become a huge size_t.
+    int errorCode = CL_INVALID_VALUE;
+    long fileSize = (fseek(fp, 0, SEEK_END) == 0) ? ftell(fp) : -1L;
+    if (fileSize >= 0 && fseek(fp, 0, SEEK_SET) == 0)
+    {
+        char* buffer = new char[(size_t)fileSize];   // new[] throws on failure, it does not return NULL
+        if (fread(buffer, 1, (size_t)fileSize, fp) == (size_t)fileSize)
+        {
+            *source = buffer;
+            *sourceSize = (size_t)fileSize;
+            errorCode = CL_SUCCESS;
+        }
+        else delete[] buffer;   // short read: do not hand back a partial program
+    }
+    if (CL_SUCCESS != errorCode) LogError("Error: Couldn't read program source file '%s'.\n", fileName);
+    fclose(fp);   // always release the file handle
+    return errorCode;
+}
+#pragma warning( pop )
\ No newline at end of file
diff --git a/clguetzli/utils.h b/clguetzli/utils.h
new file mode 100644
index 00000000..294f7137
--- /dev/null
+++ b/clguetzli/utils.h
@@ -0,0 +1,36 @@
+/*****************************************************************************
+ * Copyright (c) 2013-2016 Intel Corporation
+ * All rights reserved.
+ *
+ * WARRANTY DISCLAIMER
+ *
+ * THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
+ * MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel Corporation is the author of the Materials, and requests that all
+ * problem reports or change requests be submitted to it directly
+ *****************************************************************************/
+
+#include "CL\cl.h"
+#include <d3d9.h>
+
+
+#pragma once
+
+// Print useful information to the default output. Same usage as with printf
+void LogInfo(const char* str, ...);
+
+// Print error notification to the default output. Same usage as with printf
+void LogError(const char* str, ...);
+
+// Read OpenCL source code from fileName into a new[]-allocated buffer returned in *source; the number of bytes is returned in *sourceSize
+int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize);
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 05a625ec..cf770719 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -97,7 +97,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>Full</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
@@ -108,8 +108,9 @@
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
+      <AdditionalLibraryDirectories>$(INTELOCLSDKROOT)lib\x64</AdditionalLibraryDirectories>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -137,15 +138,16 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
+      <AdditionalLibraryDirectories>$(INTELOCLSDKROOT)lib\x64</AdditionalLibraryDirectories>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
@@ -166,9 +168,12 @@
     </Link>
   </ItemDefinitionGroup>
   <ItemGroup>
-    <ClInclude Include="guetzli\butteraugli_comparator.h" />
-    <ClInclude Include="guetzli\color_transform.h" />
-    <ClInclude Include="guetzli\comparator.h" />
+    <ClInclude Include="clguetzli\clguetzli.h" />
+    <ClInclude Include="clguetzli\ocl.h" />
+    <ClInclude Include="clguetzli\utils.h" />
+    <ClInclude Include="guetzli\butteraugli_comparator.h" />
+    <ClInclude Include="guetzli\color_transform.h" />
+    <ClInclude Include="guetzli\comparator.h" />
     <ClInclude Include="guetzli\dct_double.h" />
     <ClInclude Include="guetzli\debug_print.h" />
     <ClInclude Include="guetzli\entropy_encode.h" />
@@ -255,11 +260,14 @@
     <ClInclude Include="third_party\zlib\zconf.h" />
     <ClInclude Include="third_party\zlib\zlib.h" />
     <ClInclude Include="third_party\zlib\zutil.h" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="guetzli\butteraugli_comparator.cc" />
-    <ClCompile Include="guetzli\dct_double.cc" />
-    <ClCompile Include="guetzli\debug_print.cc" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="clguetzli\clguetzli.cpp" />
+    <ClCompile Include="clguetzli\ocl.cpp" />
+    <ClCompile Include="clguetzli\utils.cpp" />
+    <ClCompile Include="guetzli\butteraugli_comparator.cc" />
+    <ClCompile Include="guetzli\dct_double.cc" />
+    <ClCompile Include="guetzli\debug_print.cc" />
     <ClCompile Include="guetzli\entropy_encode.cc" />
     <ClCompile Include="guetzli\fdct.cc" />
     <ClCompile Include="guetzli\gamma_correct.cc" />
@@ -342,6 +350,7 @@
     <ClCompile Include="third_party\zlib\zutil.c" />
   </ItemGroup>
   <ItemGroup>
+    <None Include="clguetzli\clguetzli.cl" />
     <None Include="third_party\libpng\pngwin.def" />
     <None Include="third_party\zlib\inffas32.asm" />
     <None Include="third_party\zlib\match32.asm" />
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index b35df618..12e7d8f4 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -22,6 +22,9 @@
     <Filter Include="third_party\tcmalloc_minimal">
       <UniqueIdentifier>{f2b475de-6219-478e-9e5e-08f07ef25dbc}</UniqueIdentifier>
     </Filter>
+    <Filter Include="clguetzli">
+      <UniqueIdentifier>{64847a89-ca39-4556-ba0e-d6875c4d39ca}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="guetzli\butteraugli_comparator.h">
@@ -291,6 +294,15 @@
     <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_hook.h">
       <Filter>third_party\tcmalloc_minimal</Filter>
     </ClInclude>
+    <ClInclude Include="clguetzli\utils.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
+    <ClInclude Include="clguetzli\ocl.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
+    <ClInclude Include="clguetzli\clguetzli.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc">
@@ -542,6 +554,15 @@
     <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.cc">
       <Filter>third_party\tcmalloc_minimal</Filter>
     </ClCompile>
+    <ClCompile Include="clguetzli\utils.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
+    <ClCompile Include="clguetzli\ocl.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
+    <ClCompile Include="clguetzli\clguetzli.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <None Include="third_party\libpng\pngwin.def">
@@ -559,5 +580,8 @@
     <None Include="third_party\zlib\zlib.def">
       <Filter>third_party\zlib</Filter>
     </None>
-  </ItemGroup>
+    <None Include="clguetzli\clguetzli.cl">
+      <Filter>clguetzli</Filter>
+    </None>
+  </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/guetzli_static.vcxproj b/guetzli_static.vcxproj
index 5d9dd9cd..44a911b2 100644
--- a/guetzli_static.vcxproj
+++ b/guetzli_static.vcxproj
@@ -93,7 +93,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>Full</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
@@ -127,7 +127,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
     </ClCompile>
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 8871bcdb..834cf2f8 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -40,6 +40,8 @@
 #include <algorithm>
 #include <array>
 
+#include "clguetzli\clguetzli.h"
+
 // Restricted pointers speed up Convolution(); MSVC uses a different keyword.
 #ifdef _MSC_VER
 #define __restrict__ __restrict
@@ -68,6 +70,7 @@ static void Convolution(size_t xsize, size_t ysize,
                         float* __restrict__ result) {
   PROFILER_FUNC;
   float weight_no_border = 0;
+
   for (size_t j = 0; j <= 2 * offset; ++j) {
     weight_no_border += multipliers[j];
   }
@@ -1311,6 +1314,9 @@ double MaskDcB(double delta) {
 void MinSquareVal(size_t square_size, size_t offset,
                   size_t xsize, size_t ysize,
                   float *values) {
+
+//	clMinSquareVal(square_size, offset, xsize, ysize, values);
+
   PROFILER_FUNC;
   // offset is not negative and smaller than square_size.
   assert(offset < square_size);

From c72cece021ae550987696a59e2d252eb2c8de5e3 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 27 Apr 2017 19:38:16 +0800
Subject: [PATCH 006/189] MinSquareVal with OpenCL

---
 clguetzli/clguetzli.cl                        |  43 +++++++-
 clguetzli/clguetzli.cpp                       | 101 +++++++++++++++---
 clguetzli/clguetzli.h                         |   8 ++
 clguetzli/ocl.cpp                             |  69 +++++++++++-
 clguetzli/ocl.h                               |  13 +++
 .../butteraugli/butteraugli/butteraugli.cc    |   3 +-
 6 files changed, 216 insertions(+), 21 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index d71249b3..8d0aabff 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -6,10 +6,10 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_si
 	const int height = get_global_size(1);
 
 	int minH = offset > y ? 0 : y - offset;
-	int maxH = y + square_size - offset > height ? y + square_size - offset : height;
+	int maxH = min(y + square_size - offset, height);// < height ? y + square_size - offset : height;
 
 	int minW = offset > x ? 0 : x - offset;
-	int maxW = x + square_size - offset > width ? x + square_size - offset : width;
+	int maxW = min(x + square_size - offset, width);// < width ? x + square_size - offset : width;
 
 	float minValue = pA[minH * width + minW];
 
@@ -21,5 +21,44 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_si
 			if (tmp < minValue) minValue = tmp;
 		}
 	}
+
 	pC[y * width + x] = minValue;
 }
+
+__kernel void Convolution(__global float* multipliers, __global float* inp, __global float* result,
+							int xstep, int len, int offset, float border_ratio)
+{
+	const int ox = get_global_id(0);
+	const int y = get_global_id(1);
+	const int oxsize = get_global_size(0);
+	const int ysize = get_global_size(1);
+
+	const int x = ox * xstep;
+	const int xsize = oxsize * xstep;
+
+	float weight_no_border = 0;
+	for (int j = 0; j <= 2 * offset; j++)
+	{
+		weight_no_border += multipliers[j];
+	}
+
+	int minx = x < offset ? 0 : x - offset;
+	int maxx = min(xsize, x + len - offset) - 1;
+
+	float weight = 0.0;
+	for (int j = minx; j < maxx; j++)
+	{
+		weight += multipliers[j - x + offset];
+	}
+
+	weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
+	float scale = 1.0 / weight;
+
+	float sum = 0.0;
+	for (int j = minx; j < maxx; j++)
+	{
+		sum += inp[y * xsize + j] * multipliers[j - x + offset];
+	}
+
+	result[y * oxsize + ox] = sum * scale;
+}
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 377a468b..94a15040 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -1,23 +1,21 @@
 #include "clguetzli.h"
 #include "ocl.h"
 
-void clMinSquareVal(size_t square_size, size_t offset,
-	size_t xsize, size_t ysize,
-	float *values)
+ocl_args_d_t& getOcl(void)
 {
-	cl_int err = CL_SUCCESS;
+	static bool bInit = false;
+	static ocl_args_d_t ocl;
 
-	ocl_args_d_t ocl;
-	SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU);
-
-	cl_uint optimizedSize = ((sizeof(cl_float) * xsize * ysize - 1) / 64 + 1) * 64;
-	cl_float* inputA = (cl_float*)_aligned_malloc(optimizedSize, 4096);
-	cl_float* outputC = (cl_float*)_aligned_malloc(optimizedSize, 4096);
+	if (bInit == true) return ocl;
 
-	memcpy(inputA, values, sizeof(cl_float) * xsize * ysize);
+	bInit = true;
+	SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU);
 
-	ocl.srcA = clCreateBuffer(ocl.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_float) * xsize * ysize, inputA, &err);
-	ocl.dstMem = clCreateBuffer(ocl.context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_float) * xsize * ysize, outputC, &err);
+	cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
+	}
 
 	char* source = nullptr;
 	size_t src_size = 0;
@@ -38,13 +36,28 @@ void clMinSquareVal(size_t square_size, size_t offset,
 		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
 	}
 
+	return ocl;
+}
+
+void clMinSquareVal(size_t square_size, size_t offset,
+	size_t xsize, size_t ysize,
+	float *values)
+{
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+
+	ocl.allocA(sizeof(cl_float) * xsize * ysize);
+	ocl.allocC(sizeof(cl_float) * xsize * ysize);
+
+	memcpy(ocl.inputA, values, sizeof(cl_float) * xsize * ysize);
+
 	cl_int cloffset = offset;
 	cl_int clsquare_size = square_size;
 
 	clSetKernelArg(ocl.kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA);
 	clSetKernelArg(ocl.kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem);
-	clSetKernelArg(ocl.kernel, 2, sizeof(cl_int), (void*)&cloffset);
-	clSetKernelArg(ocl.kernel, 3, sizeof(cl_int), (void*)&clsquare_size);
+	clSetKernelArg(ocl.kernel, 2, sizeof(cl_int), (void*)&clsquare_size);
+	clSetKernelArg(ocl.kernel, 3, sizeof(cl_int), (void*)&cloffset);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, ocl.kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -70,7 +83,61 @@ void clMinSquareVal(size_t square_size, size_t offset,
 	}
 
 	memcpy(values, resultPtr, sizeof(cl_float) * xsize * ysize);
+}
+
+void clConvolution(size_t xsize, size_t ysize,
+	size_t xstep,
+	size_t len, size_t offset,
+	const float* multipliers,
+	const float* inp,
+	float border_ratio,
+	float* result)
+{
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+
+	ocl.allocA(sizeof(cl_float) * len);
+	ocl.allocB(sizeof(cl_float) * xsize * ysize);
+	ocl.allocC(sizeof(cl_float) * xsize * ysize / xstep);
+
+	memcpy(ocl.inputA, multipliers, sizeof(cl_float) * len);
+	memcpy(ocl.inputB, inp, sizeof(cl_float) * xsize * ysize);
+
+	cl_int clxstep = xstep;
+	cl_int cllen = len;
+	cl_int cloffset = offset;
+	cl_float clborder_ratio = border_ratio;
+
+	clSetKernelArg(ocl.kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA);
+	clSetKernelArg(ocl.kernel, 1, sizeof(cl_mem), (void*)&ocl.srcB);
+	clSetKernelArg(ocl.kernel, 2, sizeof(cl_mem), (void*)&ocl.dstMem);
+	clSetKernelArg(ocl.kernel, 3, sizeof(cl_int), (void*)&clxstep);
+	clSetKernelArg(ocl.kernel, 4, sizeof(cl_int), (void*)&cllen);
+	clSetKernelArg(ocl.kernel, 5, sizeof(cl_int), (void*)&cloffset);
+	clSetKernelArg(ocl.kernel, 6, sizeof(cl_float), (void*)&clborder_ratio);
+
+	size_t globalWorkSize[2] = { xsize / xstep, ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, ocl.kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
+	}
+
+	cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, true, CL_MAP_READ, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL, &err);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clEnqueueMapBuffer returned %s\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clFinish returned %s\n", TranslateOpenCLError(err));
+	}
 
-	_aligned_free(inputA);
-	_aligned_free(outputC);
+	memcpy(result, resultPtr, sizeof(cl_float) * xsize * ysize / xstep);
 }
\ No newline at end of file
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index a6cf8242..31c2e7ba 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -3,3 +3,11 @@
 void clMinSquareVal(size_t square_size, size_t offset,
 	size_t xsize, size_t ysize,
 	float *values);
+
+void clConvolution(size_t xsize, size_t ysize,
+	size_t xstep,
+	size_t len, size_t offset,
+	const float* multiplier,
+	const float* inp,
+	float border_ratio,
+	float* result);
\ No newline at end of file
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index 077d3464..be1e9071 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -102,6 +102,73 @@ ocl_args_d_t::~ocl_args_d_t()
 	* because it was not created at the startup,
 	* but just queried from OpenCL runtime.
 	*/
+
+	if (inputA) _aligned_free(inputA);
+	if (inputB) _aligned_free(inputB);
+	if (outputC) _aligned_free(outputC);
+}
+
+void* ocl_args_d_t::allocA(size_t s)
+{
+	if (s < lenA) return inputA;
+	lenA = 0;
+	_aligned_free(inputA);
+	clReleaseMemObject(srcA);
+	// Host block is rounded up to a multiple of 64 bytes; size_t keeps large sizes from being truncated.
+	size_t optimizedSize = ((s - 1) / 64 + 1) * 64;
+	inputA = _aligned_malloc(optimizedSize, 4096);
+	lenA = s;
+
+	cl_int err = 0;
+	srcA = clCreateBuffer(this->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, s, inputA, &err);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: allocA() for buffer returned %s.\n", TranslateOpenCLError(err));
+	}
+
+	return inputA;
+}
+
+void* ocl_args_d_t::allocB(size_t s)
+{
+	if (s < lenB) return inputB;
+	lenB = 0;
+	_aligned_free(inputB);
+	clReleaseMemObject(srcB);
+	// Host block is rounded up to a multiple of 64 bytes; size_t keeps large sizes from being truncated.
+	size_t optimizedSize = ((s - 1) / 64 + 1) * 64;
+	inputB = _aligned_malloc(optimizedSize, 4096);
+	lenB = s;
+
+	cl_int err = 0;
+	srcB = clCreateBuffer(this->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, s, inputB, &err);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: allocB() for buffer returned %s.\n", TranslateOpenCLError(err));
+	}
+
+	return inputB;
+}
+
+void* ocl_args_d_t::allocC(size_t s)
+{
+	if (s < lenC) return outputC;
+	lenC = 0;
+	_aligned_free(outputC);
+	clReleaseMemObject(dstMem);
+	// Host block is rounded up to a multiple of 64 bytes; size_t keeps large sizes from being truncated.
+	size_t optimizedSize = ((s - 1) / 64 + 1) * 64;
+	outputC = _aligned_malloc(optimizedSize, 4096);
+	lenC = s;
+
+	cl_int err = 0;
+	dstMem = clCreateBuffer(this->context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, s, outputC, &err);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: allocC() for buffer returned %s.\n", TranslateOpenCLError(err));
+	}
+
+	return outputC;
+}
 
 const char* TranslateOpenCLError(cl_int errorCode)
@@ -404,7 +471,7 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType)
 	// Query for all available OpenCL platforms on the system
 	// Here you enumerate all platforms and pick one which name has preferredPlatform as a sub-string
 	deviceType = CL_DEVICE_TYPE_GPU;
-	cl_platform_id platformId = FindOpenCLPlatform("Intel", deviceType);
+	cl_platform_id platformId = FindOpenCLPlatform("", deviceType);
 	if (NULL == platformId)
 	{
 		deviceType = CL_DEVICE_TYPE_CPU;
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 2e2cf02c..0a9e50b2 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -47,6 +47,10 @@ struct ocl_args_d_t
 	ocl_args_d_t();
 	~ocl_args_d_t();
 
+	void* allocA(size_t s);
+	void* allocB(size_t s);
+	void* allocC(size_t s);
+
 	// Regular OpenCL objects:
 	cl_context       context;           // hold the context handler
 	cl_device_id     device;            // hold the selected device handler
@@ -61,4 +65,13 @@ struct ocl_args_d_t
 	cl_mem           srcA;              // hold first source buffer
 	cl_mem           srcB;              // hold second source buffer
 	cl_mem           dstMem;            // hold destination buffer
+
+	void*			 inputA;
+	size_t		     lenA;
+
+	void*			 inputB;
+	size_t			 lenB;
+
+	void*			 outputC;
+	size_t			 lenC;
 };
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 834cf2f8..0f84aa48 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -1315,7 +1315,8 @@ void MinSquareVal(size_t square_size, size_t offset,
                   size_t xsize, size_t ysize,
                   float *values) {
 
-//	clMinSquareVal(square_size, offset, xsize, ysize, values);
+	clMinSquareVal(square_size, offset, xsize, ysize, values);
+	return;
 
   PROFILER_FUNC;
   // offset is not negative and smaller than square_size.

From c354348db7b3eab528e0e346463db7449a52df81 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 28 Apr 2017 00:43:31 +0800
Subject: [PATCH 007/189] =?UTF-8?q?OpenCL=20=E4=BC=98=E5=8C=96=E5=8D=B7?=
 =?UTF-8?q?=E7=A7=AF=20=E7=94=B1=E4=BA=8E=E6=9C=89=E5=A4=A7=E9=87=8F8x8?=
 =?UTF-8?q?=E5=B0=8F=E5=9B=BE=E5=83=8F=E5=9D=97=E7=9A=84=E5=8D=B7=E7=A7=AF?=
 =?UTF-8?q?=E6=93=8D=E4=BD=9C=EF=BC=8C=E6=AD=A4=E5=A4=84GPU=E6=80=A7?=
 =?UTF-8?q?=E8=83=BD=E6=B2=A1=E6=9C=89=E5=BE=97=E5=88=B0=E6=9C=80=E5=A4=A7?=
 =?UTF-8?q?=E5=8F=91=E6=8C=A5=EF=BC=8C=E5=8F=AA=E9=92=88=E5=AF=B9=E5=A4=A7?=
 =?UTF-8?q?=E5=9B=BE=E5=83=8F=E5=9D=97=E9=87=87=E7=94=A8OpenCL?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl                        |  8 ++--
 clguetzli/clguetzli.cpp                       | 47 ++++++++++++-------
 clguetzli/ocl.cpp                             | 30 +++++++++---
 clguetzli/ocl.h                               |  8 +++-
 .../butteraugli/butteraugli/butteraugli.cc    | 33 ++++++++++---
 5 files changed, 91 insertions(+), 35 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 8d0aabff..be73ceeb 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -26,15 +26,15 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_si
 }
 
 __kernel void Convolution(__global float* multipliers, __global float* inp, __global float* result,
-							int xstep, int len, int offset, float border_ratio)
+							int xsize, int xstep, int len, int offset, float border_ratio)
 {
 	const int ox = get_global_id(0);
 	const int y = get_global_id(1);
+
 	const int oxsize = get_global_size(0);
 	const int ysize = get_global_size(1);
 
 	const int x = ox * xstep;
-	const int xsize = oxsize * xstep;
 
 	float weight_no_border = 0;
 	for (int j = 0; j <= 2 * offset; j++)
@@ -43,7 +43,7 @@ __kernel void Convolution(__global float* multipliers, __global float* inp, __gl
 	}
 
 	int minx = x < offset ? 0 : x - offset;
-	int maxx = min(xsize, x + len - offset) - 1;
+	int maxx = min(xsize, x + len - offset);
 
 	float weight = 0.0;
 	for (int j = minx; j < maxx; j++)
@@ -60,5 +60,5 @@ __kernel void Convolution(__global float* multipliers, __global float* inp, __gl
 		sum += inp[y * xsize + j] * multipliers[j - x + offset];
 	}
 
-	result[y * oxsize + ox] = sum * scale;
+	result[ox * ysize + y] = sum * scale;
 }
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 94a15040..2b598416 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -30,10 +30,15 @@ ocl_args_d_t& getOcl(void)
 	{
 		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
 	}
-	ocl.kernel = clCreateKernel(ocl.program, "MinSquareVal", &err);
+	ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "MinSquareVal", &err);
 	if (CL_SUCCESS != err)
 	{
-		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
+		LogError("Error: clCreateKernel(MinSquareVal) for source program returned %s.\n", TranslateOpenCLError(err));
+	}
+	ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "Convolution", &err);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clCreateKernel(Convolution) for source program returned %s.\n", TranslateOpenCLError(err));
 	}
 
 	return ocl;
@@ -54,13 +59,14 @@ void clMinSquareVal(size_t square_size, size_t offset,
 	cl_int cloffset = offset;
 	cl_int clsquare_size = square_size;
 
-	clSetKernelArg(ocl.kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA);
-	clSetKernelArg(ocl.kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem);
-	clSetKernelArg(ocl.kernel, 2, sizeof(cl_int), (void*)&clsquare_size);
-	clSetKernelArg(ocl.kernel, 3, sizeof(cl_int), (void*)&cloffset);
+	cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem);
+	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clsquare_size);
+	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cloffset);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, ocl.kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
 		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
@@ -96,28 +102,33 @@ void clConvolution(size_t xsize, size_t ysize,
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
+	size_t oxsize = xsize / xstep;
+
 	ocl.allocA(sizeof(cl_float) * len);
 	ocl.allocB(sizeof(cl_float) * xsize * ysize);
-	ocl.allocC(sizeof(cl_float) * xsize * ysize / xstep);
+	ocl.allocC(sizeof(cl_float) * oxsize * ysize);
 
 	memcpy(ocl.inputA, multipliers, sizeof(cl_float) * len);
 	memcpy(ocl.inputB, inp, sizeof(cl_float) * xsize * ysize);
 
+	cl_int clxsize = xsize;
 	cl_int clxstep = xstep;
 	cl_int cllen = len;
 	cl_int cloffset = offset;
 	cl_float clborder_ratio = border_ratio;
 
-	clSetKernelArg(ocl.kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA);
-	clSetKernelArg(ocl.kernel, 1, sizeof(cl_mem), (void*)&ocl.srcB);
-	clSetKernelArg(ocl.kernel, 2, sizeof(cl_mem), (void*)&ocl.dstMem);
-	clSetKernelArg(ocl.kernel, 3, sizeof(cl_int), (void*)&clxstep);
-	clSetKernelArg(ocl.kernel, 4, sizeof(cl_int), (void*)&cllen);
-	clSetKernelArg(ocl.kernel, 5, sizeof(cl_int), (void*)&cloffset);
-	clSetKernelArg(ocl.kernel, 6, sizeof(cl_float), (void*)&clborder_ratio);
+	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.srcB);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&ocl.dstMem);
+	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize);
+	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep);
+	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cllen);
+	clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset);
+	clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio);
 
 	size_t globalWorkSize[2] = { xsize / xstep, ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, ocl.kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
 		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
@@ -128,7 +139,7 @@ void clConvolution(size_t xsize, size_t ysize,
 		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
 	}
 
-	cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, true, CL_MAP_READ, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL, &err);
+	cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, true, CL_MAP_READ, 0, sizeof(cl_float) * oxsize * ysize, 0, NULL, NULL, &err);
 	if (CL_SUCCESS != err)
 	{
 		LogError("Error: clEnqueueMapBuffer returned %s\n", TranslateOpenCLError(err));
@@ -139,5 +150,5 @@ void clConvolution(size_t xsize, size_t ysize,
 		LogError("Error: clFinish returned %s\n", TranslateOpenCLError(err));
 	}
 
-	memcpy(result, resultPtr, sizeof(cl_float) * xsize * ysize / xstep);
+	memcpy(result, resultPtr, sizeof(cl_float) * oxsize * ysize);
 }
\ No newline at end of file
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index be1e9071..50d3ad6c 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -7,14 +7,23 @@ ocl_args_d_t::ocl_args_d_t() :
 	device(NULL),
 	commandQueue(NULL),
 	program(NULL),
-	kernel(NULL),
 	platformVersion(OPENCL_VERSION_1_2),
 	deviceVersion(OPENCL_VERSION_1_2),
 	compilerVersion(OPENCL_VERSION_1_2),
 	srcA(NULL),
 	srcB(NULL),
-	dstMem(NULL)
+	dstMem(NULL),
+	inputA(NULL),
+	lenA(0),
+	inputB(NULL),
+	lenB(0),
+	outputC(NULL),
+	lenC(0)
 {
+	for (int i = 0; i < KERNEL_COUNT; i++)
+	{
+		kernel[i] = NULL;
+	}
 }
 
 /*
@@ -31,7 +40,15 @@ ocl_args_d_t::ocl_args_d_t() :
 ocl_args_d_t::~ocl_args_d_t()
 {
 	cl_int err = CL_SUCCESS;
-
+	for (int i = 0; i < KERNEL_COUNT; i++)
+	{
+		err = clReleaseKernel(kernel[i]);
+		if (CL_SUCCESS != err)
+		{
+			LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(err));
+		}
+	}
+/*
 	if (kernel)
 	{
 		err = clReleaseKernel(kernel);
@@ -40,6 +57,7 @@ ocl_args_d_t::~ocl_args_d_t()
 			LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(err));
 		}
 	}
+*/
 	if (program)
 	{
 		err = clReleaseProgram(program);
@@ -110,7 +128,7 @@ ocl_args_d_t::~ocl_args_d_t()
 
 void* ocl_args_d_t::allocA(size_t s)
 {
-	if (s < lenA) return inputA;
+	if (s <= lenA) return inputA;
 	lenA = 0;
 	_aligned_free(inputA);
 	clReleaseMemObject(srcA);
@@ -131,7 +149,7 @@ void* ocl_args_d_t::allocA(size_t s)
 
 void* ocl_args_d_t::allocB(size_t s)
 {
-	if (s < lenB) return inputB;
+	if (s <= lenB) return inputB;
 	lenB = 0;
 	_aligned_free(inputB);
 	clReleaseMemObject(srcB);
@@ -152,7 +170,7 @@ void* ocl_args_d_t::allocB(size_t s)
 
 void* ocl_args_d_t::allocC(size_t s)
 {
-	if (s < lenC) return outputC;
+	if (s <= lenC) return outputC;
 	lenC = 0;
 	_aligned_free(outputC);
 	clReleaseMemObject(dstMem);
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 0a9e50b2..5f21a0e3 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -42,6 +42,12 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType);
 * only, there is no OpenCL specific here: just to avoid global variables
 * and make passing all these arguments in functions easier.
 */
+
+#define KERNEL_MINSQUAREVAL 0
+#define KERNEL_CONVOLUTION 1
+
+#define KERNEL_COUNT 2
+
 struct ocl_args_d_t
 {
 	ocl_args_d_t();
@@ -56,7 +62,7 @@ struct ocl_args_d_t
 	cl_device_id     device;            // hold the selected device handler
 	cl_command_queue commandQueue;      // hold the commands-queue handler
 	cl_program       program;           // hold the program handler
-	cl_kernel        kernel;            // hold the kernel handler
+	cl_kernel        kernel[KERNEL_COUNT];            // hold the kernel handler
 	float            platformVersion;   // hold the OpenCL platform version (default 1.2)
 	float            deviceVersion;     // hold the OpenCL device version (default. 1.2)
 	float            compilerVersion;   // hold the device OpenCL C version (default. 1.2)
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 0f84aa48..26cbca23 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -62,12 +62,19 @@ inline double DotProduct(const float u[3], const double v[3]) {
 
 // Computes a horizontal convolution and transposes the result.
 static void Convolution(size_t xsize, size_t ysize,
-                        size_t xstep,
-                        size_t len, size_t offset,
-                        const float* __restrict__ multipliers,
-                        const float* __restrict__ inp,
-                        float border_ratio,
-                        float* __restrict__ result) {
+	size_t xstep,
+	size_t len, size_t offset,
+	const float* __restrict__ multipliers,
+	const float* __restrict__ inp,
+	float border_ratio,
+	float* __restrict__ result) {
+
+	if (xsize > 100 && ysize > 100)
+	{
+		clConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
+		return;
+	}
+
   PROFILER_FUNC;
   float weight_no_border = 0;
 
@@ -92,6 +99,20 @@ static void Convolution(size_t xsize, size_t ysize,
       result[ox * ysize + y] = static_cast<float>(sum * scale);
     }
   }
+
+  return;
+
+  // for verify
+  std::vector<float> tmp(xsize / xstep * ysize);
+  clConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, &tmp[0]);
+
+  for (int i = 0; i < xsize / xstep * ysize; i++)
+  {
+	  if (fabs(result[i] - tmp[i]) > 0.0001)
+	  {
+		  tmp[i] = result[i];
+	  }
+  }
 }
 
 void Blur(size_t xsize, size_t ysize, float* channel, double sigma,

From 5d8ba53d419b21010b2d3462535b18c8b525115e Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 28 Apr 2017 09:46:22 +0800
Subject: [PATCH 008/189] fix setupopencl

---
 clguetzli/clguetzli.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 2b598416..211e3fb4 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -9,8 +9,6 @@ ocl_args_d_t& getOcl(void)
 	if (bInit == true) return ocl;
 
 	bInit = true;
-	SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU);
-
 	cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU);
 	if (CL_SUCCESS != err)
 	{

From 82265a603e37f1b892f19dd417238197355a4760 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 2 May 2017 11:55:37 +0800
Subject: [PATCH 009/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

# Conflicts:
#	third_party/butteraugli/butteraugli/butteraugli.cc
---
 clguetzli/clguetzli.cl                        |  4 +-
 clguetzli/clguetzli.cpp                       |  4 +-
 clguetzli/clguetzli.h                         |  2 +
 guetzli.vcxproj                               | 64 +++++++++++++------
 guetzli.vcxproj.filters                       |  6 +-
 guetzli/guetzli.cc                            | 10 ++-
 .../butteraugli/butteraugli/butteraugli.cc    | 37 +++++++++--
 7 files changed, 95 insertions(+), 32 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index be73ceeb..6159832d 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -6,10 +6,10 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_si
 	const int height = get_global_size(1);
 
 	int minH = offset > y ? 0 : y - offset;
-	int maxH = min(y + square_size - offset, height);// < height ? y + square_size - offset : height;
+	int maxH = min(y + square_size - offset, height);
 
 	int minW = offset > x ? 0 : x - offset;
-	int maxW = min(x + square_size - offset, width);// < width ? x + square_size - offset : width;
+	int maxW = min(x + square_size - offset, width);
 
 	float minValue = pA[minH * width + minW];
 
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 211e3fb4..5db62cc2 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -1,6 +1,8 @@
 #include "clguetzli.h"
 #include "ocl.h"
 
+extern bool g_useOpenCL = false;
+
 ocl_args_d_t& getOcl(void)
 {
 	static bool bInit = false;
@@ -17,7 +19,7 @@ ocl_args_d_t& getOcl(void)
 
 	char* source = nullptr;
 	size_t src_size = 0;
-	ReadSourceFromFile("clguetzli\\clguetzli.cl", &source, &src_size);
+	ReadSourceFromFile("clguetzli.cl", &source, &src_size);
 
 	ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err);
 
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 31c2e7ba..df3dbc1d 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -1,5 +1,7 @@
 #pragma once
 
+extern bool g_useOpenCL;
+
 void clMinSquareVal(size_t square_size, size_t offset,
 	size_t xsize, size_t ysize,
 	float *values);
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index cf770719..fb32ae0f 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -51,6 +51,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\IntelOpenCL.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
@@ -78,6 +79,8 @@
     <IntDir>obj\x86\Release\guetzli\</IntDir>
     <TargetName>guetzli</TargetName>
     <TargetExt>.exe</TargetExt>
+    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty)</IncludePath>
+    <LibraryPath>$(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86)</LibraryPath>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <LinkIncremental>true</LinkIncremental>
@@ -92,6 +95,8 @@
     <IntDir>obj\x86\Debug\guetzli\</IntDir>
     <TargetName>guetzli</TargetName>
     <TargetExt>.exe</TargetExt>
+    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty)</IncludePath>
+    <LibraryPath>$(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86)</LibraryPath>
   </PropertyGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <ClCompile>
@@ -109,30 +114,43 @@
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
       <AdditionalDependencies>OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
+      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <AdditionalLibraryDirectories>$(INTELOCLSDKROOT)lib\x64</AdditionalLibraryDirectories>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    </Link>
+    <CustomBuild>
+      <Command>"$(INTELOCLSDKROOT)bin\x64\ioc64.exe" -cmd=build -input="%(FullPath)" -output="x64\Release\%(Filename).out" -VS -device=GPU -simd=default -targetos=current            -bo="           "</Command>
+    </CustomBuild>
+    <CustomBuild>
+      <Message>OpenCL Code Builder</Message>
+    </CustomBuild>
+    <CustomBuild>
+      <LinkObjects>false</LinkObjects>
+    </CustomBuild>
+    <Intel_OpenCL_Build_Rules />
+    <PostBuildEvent>
+      <Command>copy $(ProjectDir)\clguetzli\clguetzli.cl $(ProjectDir)\clguetzli.cl</Command>
+    </PostBuildEvent>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
       <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <Optimization>Full</Optimization>
+      <Optimization>Disabled</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <IntrinsicFunctions>false</IntrinsicFunctions>
       <MinimalRebuild>false</MinimalRebuild>
       <StringPooling>true</StringPooling>
-      <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <AdditionalDependencies>shlwapi.lib;OpenCL.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
-    </Link>
+    </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <ClCompile>
@@ -146,23 +164,26 @@
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
       <AdditionalDependencies>OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
+      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <AdditionalLibraryDirectories>$(INTELOCLSDKROOT)lib\x64</AdditionalLibraryDirectories>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    </Link>
+    <PostBuildEvent>
+      <Command>copy $(ProjectDir)\clguetzli\clguetzli.cl $(ProjectDir)\clguetzli.cl</Command>
+    </PostBuildEvent>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
       <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>shlwapi.lib;OpenCL.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
     </Link>
@@ -350,7 +371,11 @@
     <ClCompile Include="third_party\zlib\zutil.c" />
   </ItemGroup>
   <ItemGroup>
-    <None Include="clguetzli\clguetzli.cl" />
+    <Intel_OpenCL_Build_Rules Include="clguetzli\clguetzli.cl">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </Command>
+    </Intel_OpenCL_Build_Rules>
     <None Include="third_party\libpng\pngwin.def" />
     <None Include="third_party\zlib\inffas32.asm" />
     <None Include="third_party\zlib\match32.asm" />
@@ -359,5 +384,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\IntelOpenCL.targets" />
   </ImportGroup>
 </Project>
\ No newline at end of file
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index 12e7d8f4..308cad47 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -580,8 +580,10 @@
     <None Include="third_party\zlib\zlib.def">
       <Filter>third_party\zlib</Filter>
     </None>
-    <None Include="clguetzli\clguetzli.cl">
+  </ItemGroup>
+  <ItemGroup>
+    <Intel_OpenCL_Build_Rules Include="clguetzli\clguetzli.cl">
       <Filter>clguetzli</Filter>
-    </None>
+    </Intel_OpenCL_Build_Rules>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc
index 85cd4bb7..3355265e 100644
--- a/guetzli/guetzli.cc
+++ b/guetzli/guetzli.cc
@@ -28,6 +28,7 @@
 #include "guetzli/processor.h"
 #include "guetzli/quality.h"
 #include "guetzli/stats.h"
+#include "clguetzli\clguetzli.h"
 
 namespace {
 
@@ -225,7 +226,8 @@ void Usage() {
       "                 Default value is %d.\n"
       "  --memlimit M - Memory limit in MB. Guetzli will fail if unable to stay under\n"
       "                 the limit. Default limit is %d MB.\n"
-      "  --nomemlimit - Do not limit memory usage.\n", kDefaultJPEGQuality, kDefaultMemlimitMB);
+      "  --nomemlimit - Do not limit memory usage.\n"
+	  "  --opencl     - Use OpenCL\n", kDefaultJPEGQuality, kDefaultMemlimitMB);
   exit(1);
 }
 
@@ -256,7 +258,11 @@ int main(int argc, char** argv) {
       memlimit_mb = atoi(argv[opt_idx]);
     } else if (!strcmp(argv[opt_idx], "--nomemlimit")) {
       memlimit_mb = -1;
-    } else if (!strcmp(argv[opt_idx], "--")) {
+	}
+	else if (!strcmp(argv[opt_idx], "--opencl")) {
+		g_useOpenCL = true;
+	}
+	else if (!strcmp(argv[opt_idx], "--")) {
       opt_idx++;
       break;
     } else {
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 26cbca23..4fb7eb21 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -69,11 +69,14 @@ static void Convolution(size_t xsize, size_t ysize,
 	float border_ratio,
 	float* __restrict__ result) {
 
-	if (xsize > 100 && ysize > 100)
+#if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK)
+	if (g_useOpenCL && xsize > 100 && ysize > 100)
 	{
 		clConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
 		return;
 	}
+#endif // ENABLE_OPENCL
+
 
   PROFILER_FUNC;
   float weight_no_border = 0;
@@ -100,8 +103,8 @@ static void Convolution(size_t xsize, size_t ysize,
     }
   }
 
-  return;
 
+#ifdef ENABLE_OPENCL_CHECK
   // for verify
   std::vector<float> tmp(xsize / xstep * ysize);
   clConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, &tmp[0]);
@@ -110,9 +113,10 @@ static void Convolution(size_t xsize, size_t ysize,
   {
 	  if (fabs(result[i] - tmp[i]) > 0.0001)
 	  {
-		  tmp[i] = result[i];
+		  assert(false);
 	  }
   }
+#endif // ENABLE_OPENCL_CHECK
 }
 
 void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
@@ -1335,11 +1339,21 @@ double MaskDcB(double delta) {
 void MinSquareVal(size_t square_size, size_t offset,
                   size_t xsize, size_t ysize,
                   float *values) {
-
-	clMinSquareVal(square_size, offset, xsize, ysize, values);
-	return;
+#if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK)
+	if (g_useOpenCL)
+	{
+		clMinSquareVal(square_size, offset, xsize, ysize, values);
+		return;
+	}
+#endif // ENABLE_OPENCL
 
   PROFILER_FUNC;
+
+#ifdef ENABLE_OPENCL_CHECK
+  std::vector<float> backup(xsize * ysize);
+  memcpy(&backup[0], values, xsize * ysize);
+#endif
+
   // offset is not negative and smaller than square_size.
   assert(offset < square_size);
   std::vector<float> tmp(xsize * ysize);
@@ -1380,6 +1394,17 @@ void MinSquareVal(size_t square_size, size_t offset,
         *pValuePoint = min; pValuePoint += xsize;
     }
   }
+
+#ifdef ENABLE_OPENCL_CHECK
+  clMinSquareVal(square_size, offset, xsize, ysize, backup.data());
+  for (int i = 0; i < xsize * ysize; i++)
+  {
+	  if (fabs(backup[i] - values[i]) > 0.0001)
+	  {
+		  assert(false);
+	  }
+  }
+#endif
 }
 
 // ===== Functions used by Mask only =====

From 775c63c56b6b5e02cca4f666b1be68232f8cf987 Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Wed, 3 May 2017 00:37:52 +0800
Subject: [PATCH 010/189] Add comment for understanding.

---
 guetzli/processor.cc | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 9986f9ed..134dfe17 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -381,10 +381,10 @@ void Processor::ComputeBlockZeroingOrder(
   static const double kWeight[3] = { 1.0, 0.22, 0.20 };
 #include "guetzli/order.inc"
   std::vector<std::pair<int, float> > input_order;
-  for (int c = 0; c < 3; ++c) {
+  for (int c = 0; c < 3; ++c) { // TOBEREMOVE:��������block��input_order,��0�Ĵ��
     if (!(comp_mask & (1 << c))) continue;
     for (int k = 1; k < kDCTBlockSize; ++k) {
-      int idx = c * kDCTBlockSize + k;
+      int idx = c * kDCTBlockSize + k; // TOBEREMOVE:ÿ����������
       if (block[idx] != 0) {
         float score;
         if (params_.new_zeroing_model) {
@@ -412,7 +412,7 @@ void Processor::ComputeBlockZeroingOrder(
       coeff_t candidate_block[kBlockSize];
       memcpy(candidate_block, processed_block, sizeof(candidate_block));
       const int idx = input_order[i].first;
-      candidate_block[idx] = 0;
+      candidate_block[idx] = 0; // TOBEREMOVE:�Ա�block������÷�ǰi�͵���0(i����input_order���ݱ仯���仯)���������ûضԱ�ͼ�������������Ӧblock��ȥ�����������ԱȲ��á�
       for (int c = 0; c < 3; ++c) {
         if (comp_mask & (1 << c)) {
           img->component(c).SetCoeffBlock(
@@ -425,12 +425,12 @@ void Processor::ComputeBlockZeroingOrder(
           int block_xx = block_x * factor_x + ix;
           int block_yy = block_y * factor_y + iy;
           if (8 * block_xx < img->width() && 8 * block_yy < img->height()) {
-            float err = static_cast<float>(comparator_->CompareBlock(*img, ix, iy));
+            float err = static_cast<float>(comparator_->CompareBlock(*img, ix, iy)); // TOBEREMOVE: compare with the corresponding block of the original image; returns the error value
             max_err = std::max(max_err, err);
           }
         }
       }
-      if (max_err < best_err) {
+      if (max_err < best_err) { // TOBEREMOVE: find the i with the smallest error value
         best_err = max_err;
         best_i = i;
       }
@@ -438,7 +438,7 @@ void Processor::ComputeBlockZeroingOrder(
     int idx = input_order[best_i].first;
     processed_block[idx] = 0;
     input_order.erase(input_order.begin() + best_i);
-    output_order->push_back({idx, best_err});
+    output_order->push_back({idx, best_err}); // TOBEREMOVE:����������������С�����idx����Ӧ���Ա�block�еĶ�Ӧλ����������Ϊ0,�Ƴ�input_order���ѡȡ��ǰֵ������output_order,����ʽ�����õ��Ա�ͼ����ȥ��
     for (int c = 0; c < 3; ++c) {
       if (comp_mask & (1 << c)) {
         img->component(c).SetCoeffBlock(
@@ -446,6 +446,8 @@ void Processor::ComputeBlockZeroingOrder(
       }
     }
   }
+
+  // TOBEREMOVE:�����Ƴ�err������error���Ƶ���أ�����ԭ�Ա�ͼ��ԭʼֵ��
   // Make the block error values monotonic.
   float min_err = 1e10;
   for (int i = output_order->size() - 1; i >= 0; --i) {
@@ -560,7 +562,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
   candidate_coeff_errors.reserve(60 * num_blocks);
   std::vector<CoeffData> block_order;
   block_order.reserve(3 * kDCTBlockSize);
-  comparator_->StartBlockComparisons();
+  comparator_->StartBlockComparisons(); // TOBEREMOVE:��ʼ��һЩ����
   for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
     for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
       coeff_t block[kBlockSize] = { 0 };
@@ -570,25 +572,25 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
           assert(img->component(c).factor_x() == factor_x);
           assert(img->component(c).factor_y() == factor_y);
           img->component(c).GetCoeffBlock(block_x, block_y,
-                                          &block[c * kDCTBlockSize]);
+                                          &block[c * kDCTBlockSize]); // TOBEREMOVE: fetch the block coefficients of the comparison image
           const JPEGComponent& comp = jpg.components[c];
           int jpg_block_ix = block_y * comp.width_in_blocks + block_x;
           memcpy(&orig_block[c * kDCTBlockSize],
                  &comp.coeffs[jpg_block_ix * kDCTBlockSize],
-                 kDCTBlockSize * sizeof(orig_block[0]));
+                 kDCTBlockSize * sizeof(orig_block[0])); // TOBEREMOVE: fetch the block coefficients of the original image
         }
       }
       block_order.clear();
       ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x,
-                               factor_y, comp_mask, img, &block_order);
+                               factor_y, comp_mask, img, &block_order); // TOBEREMOVE: derive the zeroing order from the original block and the comparison image's block; the result goes to block_order
       candidate_coeff_offsets[block_ix] = candidate_coeffs.size();
-      for (size_t i = 0; i < block_order.size(); ++i) {
+      for (size_t i = 0; i < block_order.size(); ++i) { // TOBEREMOVE: copy the results into the candidate coefficients
         candidate_coeffs.push_back(block_order[i].idx);
         candidate_coeff_errors.push_back(block_order[i].block_err);
       }
     }
   }
-  comparator_->FinishBlockComparisons();
+  comparator_->FinishBlockComparisons(); // TOBEREMOVE:�������
   candidate_coeff_offsets[num_blocks] = candidate_coeffs.size();
 
   std::vector<JpegHistogram> ac_histograms(ncomp);

From 4061ccb9cbd87d12be5e5616b7a0393e8e754dd4 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 3 May 2017 17:08:08 +0800
Subject: [PATCH 011/189] =?UTF-8?q?=E5=B0=9D=E8=AF=95=E7=9C=8B=E4=B8=80?=
 =?UTF-8?q?=E4=B8=8B=E5=85=A8OpenCL=E5=8C=96Blur=E5=87=BD=E6=95=B0?=
 =?UTF-8?q?=EF=BC=8C=E4=B8=8D=E8=BF=87=E7=9B=AE=E5=89=8D=E8=AE=A1=E7=AE=97?=
 =?UTF-8?q?=E8=AF=AF=E5=B7=AE=E6=9C=89=E4=BA=9B=E5=A4=A7=EF=BC=8C=E6=98=AF?=
 =?UTF-8?q?=E5=90=A6=E6=9C=89Bug=EF=BC=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl                        | 142 ++++++++++++++++++
 clguetzli/clguetzli.cpp                       |  90 +++++++++++
 clguetzli/clguetzli.h                         |   4 +-
 clguetzli/ocl.cpp                             |   6 +-
 clguetzli/ocl.h                               |   5 +-
 guetzli.vcxproj                               |  12 +-
 .../butteraugli/butteraugli/butteraugli.cc    |  39 ++++-
 7 files changed, 284 insertions(+), 14 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 6159832d..67443e17 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -62,3 +62,145 @@ __kernel void Convolution(__global float* multipliers, __global float* inp, __gl
 
 	result[ox * ysize + y] = sum * scale;
 }
+/*
+__kernel void ConvolutionX(__global float* multipliers, __global float* inp, __global float* result,
+	int len, int offset, float border_ratio)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+
+	const int xsize = get_global_size(0);
+	const int ysize = get_global_size(1);
+
+	float weight_no_border = 0;
+	for (int j = 0; j <= 2 * offset; j++)
+	{
+		weight_no_border += multipliers[j];
+	}
+
+	int minx = x < offset ? 0 : x - offset;
+	int maxx = min(xsize, x + len - offset);
+
+	int miny = y < offset ? 0 : y - offset;
+	int maxy = min(ysize, y + len - offset);
+
+	float weightX = 0.0;
+	for (int j = minx; j < maxx; j++)
+	{
+		weightX += multipliers[j - x + offset];
+	}
+
+	weightX = (1.0 - border_ratio) * weightX + border_ratio * weight_no_border;
+
+	float weightY = 0.0;
+	for (int j = miny; j < maxy; j++)
+	{
+		weightY += multipliers[j - y + offset];
+	}
+
+	weightY = (1.0 - border_ratio) * weightY + border_ratio * weight_no_border;
+
+
+	float sum = 0.0;
+	for (int j = miny; j < maxy; j++)
+	{
+		float sumx = 0.0;
+		for (int i = minx; i < maxx; i++)
+		{
+			sumx += inp[j * xsize + i] * multipliers[i - x + offset];
+		}
+
+		sum += sumx * multipliers[j - y + offset];
+	}
+
+	result[y * xsize + x] = sum / weightY / weightX;
+}
+*/
+
+// ConvolutionX: one work-item per pixel (x, y) of an xsize * ysize image whose extent is the
+// 2-D global work size. Convolves row y of inp with multipliers and stores the result transposed.
+__kernel void ConvolutionX(__global float* multipliers, __global float* inp, __global float* result,
+	int len, int offset, float border_ratio)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	const int xsize = get_global_size(0);
+	const int ysize = get_global_size(1);
+	// Total weight of the first 2 * offset + 1 taps, i.e. of a window that no border clips.
+	float weight_no_border = 0;
+	for (int j = 0; j <= 2 * offset; j++)
+	{
+		weight_no_border += multipliers[j];
+	}
+	// Tap window [minx, maxx) around x, clipped to the row [0, xsize).
+	int minx = x < offset ? 0 : x - offset;
+	int maxx = min(xsize, x + len - offset);
+	// Weight of the taps that remain inside the clipped window.
+	float weight = 0.0;
+	for (int j = minx; j < maxx; j++)
+	{
+		weight += multipliers[j - x + offset];
+	}
+	// border_ratio blends the clipped weight (ratio 0) with the unclipped weight (ratio 1).
+	weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
+	float scale = 1.0 / weight;
+	float sum = 0.0;
+	for (int j = minx; j < maxx; j++)
+	{
+		sum += inp[y * xsize + j] * multipliers[j - x + offset];
+	}
+	// Transposed store: result has ysize floats per row and is indexed [x][y].
+	result[x * ysize + y] = sum * scale;
+}
+
+// ConvolutionY: one work-item per pixel (x, y) of an xsize * ysize image whose extent is the
+// 2-D global work size. Convolves column x of inp with multipliers; the layout is left unchanged.
+__kernel void ConvolutionY(__global float* multipliers, __global float* inp, __global float* result,
+	int len, int offset, float border_ratio)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	const int xsize = get_global_size(0);
+	const int ysize = get_global_size(1);
+	// Total weight of the first 2 * offset + 1 taps, i.e. of a window that no border clips.
+	float weight_no_border = 0;
+	for (int j = 0; j <= 2 * offset; j++)
+	{
+		weight_no_border += multipliers[j];
+	}
+	// Tap window [miny, maxy) around y, clipped to the column [0, ysize).
+	int miny = y < offset ? 0 : y - offset;
+	int maxy = min(ysize, y + len - offset);
+	// Weight of the taps that remain inside the clipped window.
+	float weight = 0.0;
+	for (int j = miny; j < maxy; j++)
+	{
+		weight += multipliers[j - y + offset];
+	}
+	// border_ratio blends the clipped weight (ratio 0) with the unclipped weight (ratio 1).
+	weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
+	float scale = 1.0 / weight;
+	float sum = 0.0;
+	for (int j = miny; j < maxy; j++)
+	{
+		sum += inp[j * xsize + x] * multipliers[j - y + offset];
+	}
+	// Untransposed store: result keeps the row-major layout of inp (xsize floats per row).
+	result[y * xsize + x] = sum * scale;
+}
+
+// DownSample: expands a reduced image pA into the full-size image pC; every output pixel copies
+// the pA sample at (x / square, y / square). The output extent is the 2-D global work size.
+__kernel void DownSample(__global float* pA, __global float* pC, int square)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	const int xsize = get_global_size(0);
+	const int ysize = get_global_size(1);	// not used below
+	// Row length of pA. NOTE(review): the division truncates, so when xsize is not a multiple of
+	// square the last columns get sample_x == oxsize, one past the end of their pA row.
+	const int oxsize = xsize / square;
+	const int sample_x = x / square;
+	const int sample_y = y / square;
+	pC[y * xsize + x] = pA[sample_y * oxsize + sample_x];
+}
\ No newline at end of file
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 5db62cc2..bbd578ad 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -1,3 +1,6 @@
+#include <math.h>
+#include <algorithm>
+#include <vector>
 #include "clguetzli.h"
 #include "ocl.h"
 
@@ -40,6 +43,9 @@ ocl_args_d_t& getOcl(void)
 	{
 		LogError("Error: clCreateKernel(Convolution) for source program returned %s.\n", TranslateOpenCLError(err));
 	}
+	ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "ConvolutionX", &err);
+	ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "ConvolutionY", &err);
+	ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "DownSample", &err);
 
 	return ocl;
 }
@@ -151,4 +157,88 @@ void clConvolution(size_t xsize, size_t ysize,
 	}
 
 	memcpy(result, resultPtr, sizeof(cl_float) * oxsize * ysize);
+}
+
+// OpenCL version of Blur(): Gaussian-blurs channel (xsize * ysize floats, row-major) in place.
+// The Convolution kernel runs twice, sampling every xstep-th position, and for xstep > 1 the
+// DownSample kernel expands the reduced result back to full size before it is copied out.
+// NOTE(review): OpenCL status codes are stored in err but never checked in this function.
+void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio)
+{
+	double m = 2.25;  // Accuracy increases when m is increased.
+	const double scaler = -1.0 / (2 * sigma * sigma);
+	// For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52}
+	const int diff = std::max<int>(1, m * fabs(sigma));
+	const int expn_size = 2 * diff + 1;
+	std::vector<float> expn(expn_size);
+	for (int i = -diff; i <= diff; ++i) {
+		expn[i + diff] = static_cast<float>(exp(scaler * i * i));
+	}
+
+	const int xstep = std::max<int>(1, int(sigma / 3));
+
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+	// Buffer A holds the filter taps, B the image; C receives the first-pass output.
+	ocl.allocA(sizeof(cl_float) * expn_size);
+	ocl.allocB(sizeof(cl_float) * xsize * ysize);
+	ocl.allocC(sizeof(cl_float) * xsize * ysize);
+	memcpy(ocl.inputA, expn.data(), sizeof(cl_float) * expn_size);
+	memcpy(ocl.inputB, channel, sizeof(cl_float) * xsize * ysize);
+
+	cl_int clxsize = xsize;
+	cl_int clxstep = xstep;
+	cl_int cllen = expn_size;
+	cl_int cloffset = diff;
+	cl_float clborder_ratio = border_ratio;
+	// Pass 1: the Convolution kernel reads B and writes C; it stores its output transposed.
+	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.srcB);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&ocl.dstMem);
+	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize);
+	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep);
+	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cllen);
+	clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset);
+	clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio);
+	size_t globalWorkSize[2] = { xsize / xstep, ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+	// Pass 2: same kernel on the transposed intermediate, reading C and writing back into B.
+	globalWorkSize[0] = ysize / xstep;
+	globalWorkSize[1] = xsize / xstep;
+	clxsize = ysize;
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&ocl.srcB);
+	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize);
+	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep);
+	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cllen);
+	clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset);
+	clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio);
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+	// NOTE(review): xsize / xstep and ysize / xstep truncate; confirm the remainder handling matches Blur().
+	cl_int clstep = xstep;
+	if (clstep <= 1)
+	{
+		cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.srcB, true, CL_MAP_READ, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL, &err);
+		err = clFinish(ocl.commandQueue);
+		memcpy(channel, resultPtr, sizeof(cl_float) * xsize * ysize);
+	}
+	else
+	{
+		kernel = ocl.kernel[KERNEL_DOWNSAMPLE];
+		clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcB);
+		clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem);
+		clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clstep);
+		// DownSample derives its row length from global size 0, so dimension 0 must be the width.
+		globalWorkSize[0] = xsize;
+		globalWorkSize[1] = ysize;
+		err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+		err = clFinish(ocl.commandQueue);
+		cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, true, CL_MAP_READ, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL, &err);
+		err = clFinish(ocl.commandQueue);
+		memcpy(channel, resultPtr, sizeof(cl_float) * xsize * ysize);
+	}
+}
\ No newline at end of file
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index df3dbc1d..e918b0d9 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -12,4 +12,6 @@ void clConvolution(size_t xsize, size_t ysize,
 	const float* multiplier,
 	const float* inp,
 	float border_ratio,
-	float* result);
\ No newline at end of file
+	float* result);
+
+void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio);
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index 50d3ad6c..3dd34e80 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -138,7 +138,7 @@ void* ocl_args_d_t::allocA(size_t s)
 	lenA = s;
 
 	cl_int err = 0;
-	srcA = clCreateBuffer(this->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, s, inputA, &err);
+	srcA = clCreateBuffer(this->context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, s, inputA, &err);
 	if (CL_SUCCESS != err)
 	{
 		LogError("Error: allocA() for buffer returned %s.\n", TranslateOpenCLError(err));
@@ -159,7 +159,7 @@ void* ocl_args_d_t::allocB(size_t s)
 	lenB = s;
 
 	cl_int err = 0;
-	srcB = clCreateBuffer(this->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, s, inputB, &err);
+	srcB = clCreateBuffer(this->context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, s, inputB, &err);
 	if (CL_SUCCESS != err)
 	{
 		LogError("Error: allocB() for buffer returned %s.\n", TranslateOpenCLError(err));
@@ -180,7 +180,7 @@ void* ocl_args_d_t::allocC(size_t s)
 	lenC = s;
 
 	cl_int err = 0;
-	dstMem = clCreateBuffer(this->context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, s, outputC, &err);
+	dstMem = clCreateBuffer(this->context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, s, outputC, &err);
 	if (CL_SUCCESS != err)
 	{
 		LogError("Error: allocB() for buffer returned %s.\n", TranslateOpenCLError(err));
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 5f21a0e3..f8a86045 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -45,8 +45,11 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType);
 
 #define KERNEL_MINSQUAREVAL 0
 #define KERNEL_CONVOLUTION 1
+#define KERNEL_CONVOLUTIONX 2
+#define KERNEL_CONVOLUTIONY 3
+#define KERNEL_DOWNSAMPLE 4
 
-#define KERNEL_COUNT 2
+#define KERNEL_COUNT 5
 
 struct ocl_args_d_t
 {
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index fb32ae0f..3aa98abf 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -108,6 +108,7 @@
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <MinimalRebuild>false</MinimalRebuild>
       <StringPooling>true</StringPooling>
+      <PreprocessorDefinitions>ENABLE_OPENCL;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
@@ -157,11 +158,12 @@
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
       <AdditionalIncludeDirectories>.;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
-      <Optimization>Disabled</Optimization>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>ENABLE_OPENCL;ENABLE_OPENCL_CHECK;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
       <AdditionalDependencies>OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 4fb7eb21..dbcac422 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -68,7 +68,7 @@ static void Convolution(size_t xsize, size_t ysize,
 	const float* __restrict__ inp,
 	float border_ratio,
 	float* __restrict__ result) {
-
+/*
 #if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK)
 	if (g_useOpenCL && xsize > 100 && ysize > 100)
 	{
@@ -76,7 +76,7 @@ static void Convolution(size_t xsize, size_t ysize,
 		return;
 	}
 #endif // ENABLE_OPENCL
-
+*/
 
   PROFILER_FUNC;
   float weight_no_border = 0;
@@ -103,7 +103,7 @@ static void Convolution(size_t xsize, size_t ysize,
     }
   }
 
-
+  /*
 #ifdef ENABLE_OPENCL_CHECK
   // for verify
   std::vector<float> tmp(xsize / xstep * ysize);
@@ -117,10 +117,24 @@ static void Convolution(size_t xsize, size_t ysize,
 	  }
   }
 #endif // ENABLE_OPENCL_CHECK
+*/
 }
 
 void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
           double border_ratio) {
+
+#if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK)
+	if (g_useOpenCL && xsize > 100 && ysize > 100)
+	{
+		clBlur(xsize, ysize, channel, sigma, border_ratio);
+		return;
+	}
+#endif // ENABLE_OPENCL
+#ifdef ENABLE_OPENCL_CHECK
+	std::vector<float> tmpChannel(xsize  * ysize);
+	memcpy(tmpChannel.data(), channel, xsize * ysize * sizeof(float));
+#endif
+
   PROFILER_FUNC;
   double m = 2.25;  // Accuracy increases when m is increased.
   const double scaler = -1.0 / (2 * sigma * sigma);
@@ -156,6 +170,23 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
       }
     }
   }
+
+#ifdef ENABLE_OPENCL_CHECK
+  // for verify
+  {
+	  if (xsize < 100 || ysize < 100) return;
+
+	  clBlur(xsize, ysize, tmpChannel.data(), sigma, border_ratio);
+
+	  for (int i = 0; i < xsize * ysize; i++)
+	  {
+		  if (fabs(channel[i] - tmpChannel[i]) > 0.0001)
+		  {
+			  float k = channel[i] - tmpChannel[i];
+		  }
+	  }
+  }
+#endif // ENABLE_OPENCL_CHECK
 }
 
 // To change this to n, add the relevant FFTn function and kFFTnMapIndexTable.
@@ -1351,7 +1382,7 @@ void MinSquareVal(size_t square_size, size_t offset,
 
 #ifdef ENABLE_OPENCL_CHECK
   std::vector<float> backup(xsize * ysize);
-  memcpy(&backup[0], values, xsize * ysize);
+  memcpy(&backup[0], values, xsize * ysize * sizeof(float));
 #endif
 
   // offset is not negative and smaller than square_size.

From d9a87afad06f2e6933cd8d6f5b768e16b4a7b1f9 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 4 May 2017 09:43:13 +0800
Subject: [PATCH 012/189] add opencl process line

---
 clguetzli/clguetzli.cl  |  27 +++++-
 clguetzli/clguetzli.cpp | 176 +++++++++++++++++++++++++++++++++++++++-
 clguetzli/clguetzli.h   |   6 +-
 clguetzli/ocl.cpp       |  12 +++
 clguetzli/ocl.h         |   2 +
 5 files changed, 218 insertions(+), 5 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 67443e17..af70a10b 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1,3 +1,11 @@
+// Scalar minimum helper for the kernels in this file: yields a only when a < b.
+float minfun(float a, float b)
+{
+	// When the comparison is false (a == b, a > b, or either operand NaN)
+	// the second argument is returned, exactly like the if/else form.
+	return (a < b) ? a : b;
+}
+
 __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_size, int offset)
 {
 	const int x = get_global_id(0);
@@ -17,14 +25,25 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_si
 	{
 		for (int i = minW; i < maxW; i++)
 		{
-			float tmp = pA[j * width + i];
-			if (tmp < minValue) minValue = tmp;
+			minValue = minfun(minValue, pA[j * width + i]);
+//			float tmp = pA[j * width + i];
+//			if (tmp < minValue) minValue = tmp;
 		}
 	}
 
 	pC[y * width + x] = minValue;
 }
 
+// Adds up multipliers[0 .. len-1] in index order and returns the sum (0 when len <= 0).
+float calcWeight(__global float* multipliers, int len)
+{
+	float total = 0;
+	int idx = 0;
+	while (idx < len)
+		total += multipliers[idx++];
+	return total;
+}
+
 __kernel void Convolution(__global float* multipliers, __global float* inp, __global float* result,
 							int xsize, int xstep, int len, int offset, float border_ratio)
 {
@@ -35,12 +54,14 @@ __kernel void Convolution(__global float* multipliers, __global float* inp, __gl
 	const int ysize = get_global_size(1);
 
 	const int x = ox * xstep;
-
+/*
 	float weight_no_border = 0;
 	for (int j = 0; j <= 2 * offset; j++)
 	{
 		weight_no_border += multipliers[j];
 	}
+*/
+	float weight_no_border = calcWeight(multipliers, len);
 
 	int minx = x < offset ? 0 : x - offset;
 	int maxx = min(xsize, x + len - offset);
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index bbd578ad..ea8ff091 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -241,4 +241,178 @@ void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double bor
 		err = clFinish(ocl.commandQueue);
 		memcpy(channel, resultPtr, sizeof(cl_float) * xsize * ysize);
 	}
-}
\ No newline at end of file
+}
+
+void clConvolutionEx(cl_mem image, size_t xsize, size_t ysize, cl_mem expn, size_t expn_size,
+	int step, int offset, double border_ratio, cl_mem result)
+{
+	// Convolution
+}
+
+void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize, size_t xstep, size_t ystep, cl_mem result)
+{
+/*
+	for (size_t y = 0; y < ysize; y++) {
+		for (size_t x = 0; x < xsize; x++) {
+			// TODO: Use correct rounding.
+			channel[y * xsize + x] =
+				downsampled_output[(y / ystep) * dxsize + (x / xstep)];
+		}
+	}
+*/
+}
+
+void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio)
+{
+	double m = 2.25;  // Accuracy increases when m is increased.
+	const double scaler = -1.0 / (2 * sigma * sigma);
+	// For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52}
+	const int diff = std::max<int>(1, m * fabs(sigma));
+	const int expn_size = 2 * diff + 1;
+	std::vector<float> expn(expn_size);
+	for (int i = -diff; i <= diff; ++i) {
+		expn[i + diff] = static_cast<float>(exp(scaler * i * i));
+	}
+
+	const int xstep = std::max<int>(1, int(sigma / 3));
+	const int ystep = xstep;
+	int dxsize = (xsize + xstep - 1) / xstep;
+	int dysize = (ysize + ystep - 1) / ystep;
+
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+	cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size);
+
+	clEnqueueWriteBuffer(ocl.commandQueue, mem_expn, CL_FALSE, 0, sizeof(cl_float) * expn_size, expn.data(), 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+
+	if (xstep > 1)
+	{
+		ocl.allocA(sizeof(cl_float) * dxsize * ysize);
+		ocl.allocB(sizeof(cl_float) * dxsize * dysize);
+
+		clConvolutionEx(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA);
+		clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, ocl.srcB);
+		clUpsampleEx(ocl.srcB, dxsize, dysize, xstep, ystep, image);
+	}
+	else
+	{
+		ocl.allocA(sizeof(cl_float) * xsize * ysize);
+		clConvolutionEx(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA);
+		clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, image);
+	}
+
+	clReleaseMemObject(mem_expn);
+}
+
+void clOpsinDynamicsImageEx(cl_mem r, cl_mem g, cl_mem b, size_t size)
+{
+/*
+	for (size_t i = 0; i < rgb[0].size(); ++i) {
+		double sensitivity[3];
+		{
+			// Calculate sensitivity[3] based on the smoothed image gamma derivative.
+			double pre_rgb[3] = { blurred[0][i], blurred[1][i], blurred[2][i] };
+			double pre_mixed[3];
+			OpsinAbsorbance(pre_rgb, pre_mixed);
+			sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];
+			sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1];
+			sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2];
+		}
+		double cur_rgb[3] = { rgb[0][i],  rgb[1][i],  rgb[2][i] };
+		double cur_mixed[3];
+		OpsinAbsorbance(cur_rgb, cur_mixed);
+		cur_mixed[0] *= sensitivity[0];
+		cur_mixed[1] *= sensitivity[1];
+		cur_mixed[2] *= sensitivity[2];
+		double x, y, z;
+		RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z);
+		rgb[0][i] = static_cast<float>(x);
+		rgb[1][i] = static_cast<float>(y);
+		rgb[2][i] = static_cast<float>(z);
+*/
+}
+
+void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b)
+{
+	static const double kSigma = 1.1;
+
+	cl_int channel_size = xsize * ysize * sizeof(float);
+
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+	cl_mem mem_r = ocl.allocMem(channel_size);
+	cl_mem mem_g = ocl.allocMem(channel_size);
+	cl_mem mem_b = ocl.allocMem(channel_size);
+
+	clEnqueueWriteBuffer(ocl.commandQueue, mem_r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, mem_g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, mem_b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+
+	clBlurEx(mem_r, xsize, ysize, kSigma, 0.0);
+	clBlurEx(mem_g, xsize, ysize, kSigma, 0.0);
+	clBlurEx(mem_b, xsize, ysize, kSigma, 0.0);
+
+	clOpsinDynamicsImageEx(mem_r, mem_g, mem_b, xsize * ysize);
+
+	cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *result_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *result_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	err = clFinish(ocl.commandQueue);
+
+	memcpy(r, result_r, channel_size);
+	memcpy(g, result_g, channel_size);
+	memcpy(b, result_b, channel_size);
+
+	clReleaseMemObject(mem_r);
+	clReleaseMemObject(mem_g);
+	clReleaseMemObject(mem_b);
+}
+
+void clMaskHighIntensityChangeEx(cl_mem r, cl_mem g, cl_mem b,
+								 cl_mem r2, cl_mem g2,cl_mem b2,
+								 size_t xsize, size_t ysize)
+{
+	// MaskHighIntensityChange
+}
+
+void clEdgeDetectorMap(cl_mem r, cl_mem g, cl_mem b,
+	cl_mem r2, cl_mem g2, cl_mem b2,
+	size_t xsize, size_t ysize)
+{
+	static const double kSigma[3] = { 1.5, 0.586, 0.4 };
+	clBlurEx(r, xsize, ysize, kSigma[0], 0.0);
+	clBlurEx(r2, xsize, ysize, kSigma[0], 0.0);
+	clBlurEx(g, xsize, ysize, kSigma[1], 0.0);
+	clBlurEx(r2, xsize, ysize, kSigma[1], 0.0);
+	clBlurEx(b, xsize, ysize, kSigma[2], 0.0);
+	clBlurEx(b2, xsize, ysize, kSigma[2], 0.0);
+}
+void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b,
+								 float* r2, float* g2, float* b2,
+								 size_t xsize, size_t ysize,
+								 float* result)
+{
+
+	cl_int channel_size = xsize * ysize * sizeof(float);
+
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+	cl_mem mem_r = ocl.allocMem(channel_size);
+	cl_mem mem_g = ocl.allocMem(channel_size);
+	cl_mem mem_b = ocl.allocMem(channel_size);
+	cl_mem mem_r2 = ocl.allocMem(channel_size);
+	cl_mem mem_g2 = ocl.allocMem(channel_size);
+	cl_mem mem_b2 = ocl.allocMem(channel_size);
+
+	clEnqueueWriteBuffer(ocl.commandQueue, mem_r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, mem_g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, mem_b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, mem_r2, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, mem_g2, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, mem_b2, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+
+	clMaskHighIntensityChangeEx(mem_r, mem_g, mem_b, mem_r2, mem_g2, mem_b2, xsize, ysize);
+}
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index e918b0d9..fa489667 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -1,7 +1,11 @@
 #pragma once
-
+#include "CL\cl.h"
 extern bool g_useOpenCL;
 
+void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float& b);
+
+void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio);
+
 void clMinSquareVal(size_t square_size, size_t offset,
 	size_t xsize, size_t ysize,
 	float *values);
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index 3dd34e80..36fc4041 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -189,6 +189,18 @@ void* ocl_args_d_t::allocC(size_t s)
 	return outputC;
 }
 
+// Creates a read/write device buffer of exactly s bytes in this->context; a failure is logged and the handle from clCreateBuffer is returned as-is.
+cl_mem ocl_args_d_t::allocMem(size_t s)
+{
+	cl_int err = 0;
+	cl_mem mem = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err);	// NOTE(review): size is passed unrounded; the dropped local computed a 64-byte round-up but was never used
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: allocMem() for buffer returned %s.\n", TranslateOpenCLError(err));
+	}
+	return mem;
+}
+
 const char* TranslateOpenCLError(cl_int errorCode)
 {
 	switch (errorCode)
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index f8a86045..04bf5b1d 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -60,6 +60,8 @@ struct ocl_args_d_t
 	void* allocB(size_t s);
 	void* allocC(size_t s);
 
+	cl_mem allocMem(size_t s);
+
 	// Regular OpenCL objects:
 	cl_context       context;           // hold the context handler
 	cl_device_id     device;            // hold the selected device handler

From b6188431f0839d5e0608ecc746412bac11f6c4fb Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 4 May 2017 09:45:05 +0800
Subject: [PATCH 013/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

---
 third_party/butteraugli/butteraugli/butteraugli.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index dbcac422..fb895b34 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -1381,8 +1381,7 @@ void MinSquareVal(size_t square_size, size_t offset,
   PROFILER_FUNC;
 
 #ifdef ENABLE_OPENCL_CHECK
-  std::vector<float> backup(xsize * ysize);
-  memcpy(&backup[0], values, xsize * ysize * sizeof(float));
+  std::vector<float> backup(values, values + xsize * ysize);
 #endif
 
   // offset is not negative and smaller than square_size.

From 0ba681731a7bf7f9c02cbdd003391f8979bf3a23 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 4 May 2017 14:09:31 +0800
Subject: [PATCH 014/189] add function

---
 clguetzli/clguetzli.cpp | 94 +++++++++++++++++++++++++++++------------
 clguetzli/ocl.cpp       | 25 +++++++++++
 clguetzli/ocl.h         |  8 ++++
 3 files changed, 101 insertions(+), 26 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index ea8ff091..21f5cc73 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -370,25 +370,55 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float*
 	clReleaseMemObject(mem_b);
 }
 
-void clMaskHighIntensityChangeEx(cl_mem r, cl_mem g, cl_mem b,
-								 cl_mem r2, cl_mem g2,cl_mem b2,
-								 size_t xsize, size_t ysize)
+void clMaskHighIntensityChangeEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize)
 {
 	// MaskHighIntensityChange
 }
 
-void clEdgeDetectorMap(cl_mem r, cl_mem g, cl_mem b,
-	cl_mem r2, cl_mem g2, cl_mem b2,
-	size_t xsize, size_t ysize)
+void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, cl_mem result)
 {
 	static const double kSigma[3] = { 1.5, 0.586, 0.4 };
-	clBlurEx(r, xsize, ysize, kSigma[0], 0.0);
-	clBlurEx(r2, xsize, ysize, kSigma[0], 0.0);
-	clBlurEx(g, xsize, ysize, kSigma[1], 0.0);
-	clBlurEx(r2, xsize, ysize, kSigma[1], 0.0);
-	clBlurEx(b, xsize, ysize, kSigma[2], 0.0);
-	clBlurEx(b2, xsize, ysize, kSigma[2], 0.0);
+	clBlurEx(rgb.r,  xsize, ysize, kSigma[0], 0.0);
+	clBlurEx(rgb2.r, xsize, ysize, kSigma[0], 0.0);
+	clBlurEx(rgb.g,  xsize, ysize, kSigma[1], 0.0);
+	clBlurEx(rgb2.g, xsize, ysize, kSigma[1], 0.0);
+	clBlurEx(rgb.b,  xsize, ysize, kSigma[2], 0.0);
+	clBlurEx(rgb2.b, xsize, ysize, kSigma[2], 0.0);
+
+	// EdgeDetectorLowFreq
 }
+
+void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2,
+	size_t xsize, size_t ysize,
+	cl_mem block_diff_dc, cl_mem block_diff_ac)
+{
+
+}
+
+void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2,
+	size_t xsize, size_t ysize,
+	cl_mem block_diff_ac)
+{
+
+}
+
+void clMaskEx(ocl_channels rgb, ocl_channels rgb2,
+	size_t xsize, size_t ysize,
+	ocl_channels mask, ocl_channels mask_dc)
+{
+
+}
+
+void clCombineChannelsEx(ocl_channels mask, ocl_channels mask_dc, cl_mem block_diff_dc, cl_mem block_diff_ac, cl_mem edge_detector_map, cl_mem result)
+{
+
+}
+
+void clCalculateDiffmap(cl_mem result, size_t xsize, size_t ysize, int step)
+{
+
+}
+
 void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b,
 								 float* r2, float* g2, float* b2,
 								 size_t xsize, size_t ysize,
@@ -399,20 +429,32 @@ void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b,
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	cl_mem mem_r = ocl.allocMem(channel_size);
-	cl_mem mem_g = ocl.allocMem(channel_size);
-	cl_mem mem_b = ocl.allocMem(channel_size);
-	cl_mem mem_r2 = ocl.allocMem(channel_size);
-	cl_mem mem_g2 = ocl.allocMem(channel_size);
-	cl_mem mem_b2 = ocl.allocMem(channel_size);
-
-	clEnqueueWriteBuffer(ocl.commandQueue, mem_r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, mem_g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, mem_b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, mem_r2, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, mem_g2, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, mem_b2, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
+	ocl_channels xyb = ocl.allocMemChannels(channel_size);
+	ocl_channels xyb2 = ocl.allocMemChannels(channel_size);
+
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb2.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb2.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
-	clMaskHighIntensityChangeEx(mem_r, mem_g, mem_b, mem_r2, mem_g2, mem_b2, xsize, ysize);
+	clMaskHighIntensityChangeEx(xyb, xyb2, xsize, ysize);
+
+	cl_mem edge_detector_map = ocl.allocMem(3 * xsize * ysize);
+	cl_mem block_diff_dc = ocl.allocMem(3 * xsize * ysize);
+	cl_mem block_diff_ac = ocl.allocMem(3 * xsize * ysize);
+
+	ocl_channels mask;
+	ocl_channels mask_dc;
+
+	cl_mem mem_result;
+
+	clEdgeDetectorMapEx(xyb, xyb2, xsize, ysize, edge_detector_map);
+	clBlockDiffMapEx(xyb, xyb2, xsize, ysize, block_diff_dc, block_diff_ac);
+	clEdgeDetectorLowFreqEx(xyb, xyb2, xsize, ysize, block_diff_ac);
+
+	clMaskEx(xyb, xyb2, xsize, ysize, mask, mask_dc);
+	clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, mem_result);
 }
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index 36fc4041..8272ac8c 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -201,6 +201,31 @@ cl_mem ocl_args_d_t::allocMem(size_t s)
 	return mem;
 }
 
+// Creates three read/write device buffers of s bytes each (r, g, b); every failure is logged and allocation continues with the next channel.
+ocl_channels ocl_args_d_t::allocMemChannels(size_t s)
+{
+	cl_int err = 0;
+
+	ocl_channels img;
+	img.r = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: allocMemR() for buffer returned %s.\n", TranslateOpenCLError(err));
+	}
+	img.g = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: allocMemG() for buffer returned %s.\n", TranslateOpenCLError(err));
+	}
+	img.b = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: allocMemB() for buffer returned %s.\n", TranslateOpenCLError(err));
+	}
+
+	return img;
+}
+
 const char* TranslateOpenCLError(cl_int errorCode)
 {
 	switch (errorCode)
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 04bf5b1d..0161d1a1 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -51,6 +51,13 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType);
 
 #define KERNEL_COUNT 5
 
+struct ocl_channels
+{
+	cl_mem r;
+	cl_mem g;
+	cl_mem b;
+};
+
 struct ocl_args_d_t
 {
 	ocl_args_d_t();
@@ -61,6 +68,7 @@ struct ocl_args_d_t
 	void* allocC(size_t s);
 
 	cl_mem allocMem(size_t s);
+	ocl_channels allocMemChannels(size_t s);
 
 	// Regular OpenCL objects:
 	cl_context       context;           // hold the context handler

From d4c9ed96b2ccb96dc8e960ddc47a475f3e55940c Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 4 May 2017 17:01:08 +0800
Subject: [PATCH 015/189] =?UTF-8?q?=E6=90=AD=E5=BB=BA=20clDiffmapOpsinDyna?=
 =?UTF-8?q?micsImage=20=E7=9A=84=E8=AE=A1=E7=AE=97=E6=B5=81=E7=A8=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cpp | 230 ++++++++++++++++++++++++++++++++++------
 clguetzli/clguetzli.h   |   4 +-
 clguetzli/ocl.cpp       |  31 +++---
 clguetzli/ocl.h         |  16 ++-
 4 files changed, 223 insertions(+), 58 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 21f5cc73..2ff242c4 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -242,14 +242,20 @@ void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double bor
 		memcpy(channel, resultPtr, sizeof(cl_float) * xsize * ysize);
 	}
 }
-
-void clConvolutionEx(cl_mem image, size_t xsize, size_t ysize, cl_mem expn, size_t expn_size,
-	int step, int offset, double border_ratio, cl_mem result)
+//=========================================================
+// ian todo
+void clConvolutionEx(cl_mem image, size_t xsize, size_t ysize,
+				     cl_mem expn, size_t expn_size,
+                     int step, int offset, double border_ratio,
+                     cl_mem result/*out*/)
 {
 	// Convolution
 }
 
-void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize, size_t xstep, size_t ystep, cl_mem result)
+// ian todo
+void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize,
+                  size_t xstep, size_t ystep,
+                  cl_mem result/*out*/)
 {
 /*
 	for (size_t y = 0; y < ysize; y++) {
@@ -262,7 +268,9 @@ void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize, size_t xstep, size_t
 */
 }
 
-void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio)
+void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
+              double sigma, double border_ratio,
+              cl_mem result/*out, opt*/)
 {
 	double m = 2.25;  // Accuracy increases when m is increased.
 	const double scaler = -1.0 / (2 * sigma * sigma);
@@ -293,19 +301,20 @@ void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double bor
 
 		clConvolutionEx(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA);
 		clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, ocl.srcB);
-		clUpsampleEx(ocl.srcB, dxsize, dysize, xstep, ystep, image);
+		clUpsampleEx(ocl.srcB, dxsize, dysize, xstep, ystep, result ? result : image);
 	}
 	else
 	{
 		ocl.allocA(sizeof(cl_float) * xsize * ysize);
 		clConvolutionEx(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA);
-		clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, image);
+		clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, result ? result : image);
 	}
 
 	clReleaseMemObject(mem_expn);
 }
 
-void clOpsinDynamicsImageEx(cl_mem r, cl_mem g, cl_mem b, size_t size)
+// ian todo
+void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t size)
 {
 /*
 	for (size_t i = 0; i < rgb[0].size(); ++i) {
@@ -333,6 +342,7 @@ void clOpsinDynamicsImageEx(cl_mem r, cl_mem g, cl_mem b, size_t size)
 */
 }
 
+// strong todo
 void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b)
 {
 	static const double kSigma = 1.1;
@@ -341,41 +351,41 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float*
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	cl_mem mem_r = ocl.allocMem(channel_size);
-	cl_mem mem_g = ocl.allocMem(channel_size);
-	cl_mem mem_b = ocl.allocMem(channel_size);
+    ocl_channels rgb = ocl.allocMemChannels(channel_size);
 
-	clEnqueueWriteBuffer(ocl.commandQueue, mem_r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, mem_g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, mem_b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
-	clBlurEx(mem_r, xsize, ysize, kSigma, 0.0);
-	clBlurEx(mem_g, xsize, ysize, kSigma, 0.0);
-	clBlurEx(mem_b, xsize, ysize, kSigma, 0.0);
+	clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0);
+	clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0);
+	clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0);
 
-	clOpsinDynamicsImageEx(mem_r, mem_g, mem_b, xsize * ysize);
+	clOpsinDynamicsImageEx(rgb, xsize * ysize);
 
-	cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-	cl_float *result_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-	cl_float *result_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *result_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *result_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
 	err = clFinish(ocl.commandQueue);
 
 	memcpy(r, result_r, channel_size);
 	memcpy(g, result_g, channel_size);
 	memcpy(b, result_b, channel_size);
 
-	clReleaseMemObject(mem_r);
-	clReleaseMemObject(mem_g);
-	clReleaseMemObject(mem_b);
+    ocl.releaseMemChannels(rgb);
 }
 
-void clMaskHighIntensityChangeEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize)
+// ian todo
+void clMaskHighIntensityChangeEx(ocl_channels rgb/*in,out*/,
+                                 ocl_channels rgb2/*in,out*/,
+                                 size_t xsize, size_t ysize)
 {
 	// MaskHighIntensityChange
 }
 
-void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, cl_mem result)
+// strong todo
+void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, cl_mem result/*out*/)
 {
 	static const double kSigma[3] = { 1.5, 0.586, 0.4 };
 	clBlurEx(rgb.r,  xsize, ysize, kSigma[0], 0.0);
@@ -388,37 +398,184 @@ void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size
 	// EdgeDetectorLowFreq
 }
 
+// strong todo
 void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2,
 	size_t xsize, size_t ysize,
-	cl_mem block_diff_dc, cl_mem block_diff_ac)
+	cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/)
 {
 
 }
 
+// strong todo
 void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2,
 	size_t xsize, size_t ysize,
-	cl_mem block_diff_ac)
+	cl_mem block_diff_ac/*out*/)
+{
+	static const double kSigma = 14;
+	static const double kMul = 10;
+
+	clBlurEx(rgb.r, xsize, ysize,  kSigma, 0.0);
+	clBlurEx(rgb2.r, xsize, ysize, kSigma, 0.0);
+	clBlurEx(rgb.g, xsize, ysize,  kSigma, 0.0);
+	clBlurEx(rgb2.g, xsize, ysize, kSigma, 0.0);
+	clBlurEx(rgb.b, xsize, ysize,  kSigma, 0.0);
+	clBlurEx(rgb2.b, xsize, ysize, kSigma, 0.0);
+}
+
+// ian todo: placeholder with an empty body; clMaskEx calls it first and passes `mask` as the output channels.
+void clDiffPrecomputeEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, ocl_channels mask/*out*/)
+{
+    // Not implemented yet: no work is enqueued and `mask` is left untouched.
+}
+
+// ian todo: placeholder with an empty body; callers pass an image, xsize * ysize as `size`, a factor `w`, and an output buffer.
+void clScaleImageEx(cl_mem img, size_t size, float w, cl_mem result/*out*/)
+{
+    // Not implemented yet: nothing is read from img or written to result.
+}
+
+// ian todo: unfinished skeleton of the 5x5 averaging pass over img; the neighbourhood sum itself is not implemented yet.
+void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize)
+{
+    static const float w = 0.679144890667f;             // factor used for the first scaled copy
+    static const float scale = 1.0f / (5.0f + 4 * w);   // factor applied to img in the final step
+    // Start the scratch handles as null: they are passed by value below, and reading an uninitialized handle is undefined.
+    cl_mem tmp0 = nullptr;
+    cl_mem tmp1 = nullptr;
+    clScaleImageEx(img, xsize * ysize, w, tmp0);
+    clScaleImageEx(img, xsize * ysize, 1, tmp1);
+    // average5x5 calc
+    // NOTE(review): tmp0/tmp1 still need real device buffers (and a matching release) once clScaleImageEx does work.
+    clScaleImageEx(img, xsize * ysize, scale, img);
+}
+
+// Placeholder with an empty body; clMaskEx calls it per channel with square_size 4 and offset 0.
+void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset)
 {
 
 }
 
+static const double kInternalGoodQualityThreshold = 14.921561160295326;
+static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold;
+
+// ian todo
 void clMaskEx(ocl_channels rgb, ocl_channels rgb2,
 	size_t xsize, size_t ysize,
-	ocl_channels mask, ocl_channels mask_dc)
+	ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/)
 {
-
+    clDiffPrecomputeEx(rgb, rgb2, xsize, ysize, mask);
+    for (int i = 0; i < 3; i++)
+    {
+        clAverage5x5Ex(mask.ch[i], xsize, ysize);
+        clMinSquareValEx(mask.ch[i], xsize, ysize, 4, 0);
+
+        static const double sigma[3] = {
+            9.65781083553,
+            14.2644604355,
+            4.53358927369,
+        };
+
+        clBlurEx(mask.ch[i], xsize, ysize, sigma[i], 0.0);
+    }
+/*
+    static const double w00 = 232.206464018;
+    static const double w11 = 22.9455222245;
+    static const double w22 = 503.962310606;
+
+    mask_dc->resize(3);
+    for (int i = 0; i < 3; ++i) {
+        (*mask_dc)[i].resize(xsize * ysize);
+    }
+    for (size_t y = 0; y < ysize; ++y) {
+        for (size_t x = 0; x < xsize; ++x) {
+            const size_t idx = y * xsize + x;
+            const double s0 = (*mask)[0][idx];
+            const double s1 = (*mask)[1][idx];
+            const double s2 = (*mask)[2][idx];
+            const double p0 = w00 * s0;
+            const double p1 = w11 * s1;
+            const double p2 = w22 * s2;
+
+            (*mask)[0][idx] = static_cast<float>(MaskX(p0));
+            (*mask)[1][idx] = static_cast<float>(MaskY(p1));
+            (*mask)[2][idx] = static_cast<float>(MaskB(p2));
+            (*mask_dc)[0][idx] = static_cast<float>(MaskDcX(p0));
+            (*mask_dc)[1][idx] = static_cast<float>(MaskDcY(p1));
+            (*mask_dc)[2][idx] = static_cast<float>(MaskDcB(p2));
+        }
+    }
+*/
+    for (int i = 0; i < 3; i++)
+    {
+        clScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale, mask.ch[i]);
+        clScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale, mask_dc.ch[i]);
+    }
 }
 
-void clCombineChannelsEx(ocl_channels mask, ocl_channels mask_dc, cl_mem block_diff_dc, cl_mem block_diff_ac, cl_mem edge_detector_map, cl_mem result)
+// ian todo
+void clCombineChannelsEx(ocl_channels mask, ocl_channels mask_dc, cl_mem block_diff_dc, cl_mem block_diff_ac, cl_mem edge_detector_map, size_t step, cl_mem result/*out*/)
 {
 
 }
 
-void clCalculateDiffmap(cl_mem result, size_t xsize, size_t ysize, int step)
+// strong todo: skeleton of the diffmap post-processing on `result`; the commented-out blocks keep the CPU reference code for the steps not ported yet.
+void clCalculateDiffmapEx(cl_mem result/*in,out*/, size_t xsize, size_t ysize, int step)
 {
+/*
+    int s2 = (8 - step) / 2;
+    {
+        // Upsample and take square root.
+        std::vector<float> diffmap_out(xsize * ysize);
+        const size_t res_xsize = (xsize + step - 1) / step;
+        for (size_t res_y = 0; res_y + 8 - step < ysize; res_y += step) {
+            for (size_t res_x = 0; res_x + 8 - step < xsize; res_x += step) {
+                size_t res_ix = (res_y * res_xsize + res_x) / step;
+                float orig_val = (*diffmap)[res_ix];
+                constexpr float kInitialSlope = 100;
+                // TODO(b/29974893): Until that is fixed do not call sqrt on very small
+                // numbers.
+                double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope))
+                    ? kInitialSlope * orig_val
+                    : std::sqrt(orig_val);
+                for (size_t off_y = 0; off_y < step; ++off_y) {
+                    for (size_t off_x = 0; off_x < step; ++off_x) {
+                        diffmap_out[(res_y + off_y + s2) * xsize +
+                            res_x + off_x + s2] = val;
+                    }
+                }
+            }
+        }
+        *diffmap = diffmap_out;
+    }
+*/
+    static const double kSigma = 8.8510880283;
+    static const double mul1 = 24.8235314874;
+    static const double scale = 1.0 / (1.0 + mul1);  // final factor applied to `result`
+    const int s = 8 - step;                          // NOTE(review): xsize - s / ysize - s below wrap if the image is smaller than s
+    const int s2 = (8 - step) / 2;                   // only referenced by the commented-out reference code
 
+    cl_mem blurred = nullptr;  // null until a real buffer is allocated; the handle is passed by value to clBlurEx below
+/*
+    for (size_t y = 0; y < ysize - s; ++y) {
+    for (size_t x = 0; x < xsize - s; ++x) {
+    blurred[y * (xsize - s) + x] = (*diffmap)[(y + s2) * xsize + x + s2];
+    }
+    }
+*/
+    static const double border_ratio = 0.03027655136;
+    clBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio);
+/*
+    for (size_t y = 0; y < ysize - s; ++y) {
+    for (size_t x = 0; x < xsize - s; ++x) {
+    (*diffmap)[(y + s2) * xsize + x + s2]
+    += static_cast<float>(mul1) * blurred[y * (xsize - s) + x];
+    }
+    }
+*/
+    clScaleImageEx(result, xsize * ysize, scale, result);
 }
 
+// strong todo
 void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b,
 								 float* r2, float* g2, float* b2,
 								 size_t xsize, size_t ysize,
@@ -440,8 +597,6 @@ void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b,
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
-	clMaskHighIntensityChangeEx(xyb, xyb2, xsize, ysize);
-
 	cl_mem edge_detector_map = ocl.allocMem(3 * xsize * ysize);
 	cl_mem block_diff_dc = ocl.allocMem(3 * xsize * ysize);
 	cl_mem block_diff_ac = ocl.allocMem(3 * xsize * ysize);
@@ -451,10 +606,15 @@ void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b,
 
 	cl_mem mem_result;
 
+	clMaskHighIntensityChangeEx(xyb, xyb2, xsize, ysize);
+
 	clEdgeDetectorMapEx(xyb, xyb2, xsize, ysize, edge_detector_map);
 	clBlockDiffMapEx(xyb, xyb2, xsize, ysize, block_diff_dc, block_diff_ac);
 	clEdgeDetectorLowFreqEx(xyb, xyb2, xsize, ysize, block_diff_ac);
 
+    int step = 4;
 	clMaskEx(xyb, xyb2, xsize, ysize, mask, mask_dc);
-	clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, mem_result);
+	clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, step, mem_result);
+
+    clCalculateDiffmapEx(mem_result, xsize, ysize, step);
 }
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index fa489667..edaa0688 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -2,9 +2,7 @@
 #include "CL\cl.h"
 extern bool g_useOpenCL;
 
-void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float& b);
-
-void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio);
+void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio, cl_mem result = nullptr);
 
 void clMinSquareVal(size_t square_size, size_t offset,
 	size_t xsize, size_t ysize,
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index 8272ac8c..26c68f5c 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -207,25 +207,26 @@ ocl_channels ocl_args_d_t::allocMemChannels(size_t s)
 	cl_int err = 0;
 
 	ocl_channels img;
-	img.r = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: allocMemR() for buffer returned %s.\n", TranslateOpenCLError(err));
-	}
-	img.g = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: allocMemG() for buffer returned %s.\n", TranslateOpenCLError(err));
-	}
-	img.b = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: allocMemB() for buffer returned %s.\n", TranslateOpenCLError(err));
-	}
+    for (int i = 0; i < 3; i++)
+    {
+        img.ch[i] = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err);
+        if (CL_SUCCESS != err)
+        {
+            LogError("Error: allocMemChannel(%d) for buffer returned %s.\n", i, TranslateOpenCLError(err));
+        }
+    }
 
 	return img;
 }
 
+void ocl_args_d_t::releaseMemChannels(ocl_channels rgb)
+{
+    for (int i = 0; i < 3; i++)
+    {
+        clReleaseMemObject(rgb.ch[i]);
+    }
+}
+
 const char* TranslateOpenCLError(cl_int errorCode)
 {
 	switch (errorCode)
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 0161d1a1..5eb19560 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -51,12 +51,17 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType);
 
 #define KERNEL_COUNT 5
 
-struct ocl_channels
+typedef union ocl_channels_t
 {
-	cl_mem r;
-	cl_mem g;
-	cl_mem b;
-};
+    struct
+    {
+        cl_mem r;
+        cl_mem g;
+        cl_mem b;
+    };
+
+    cl_mem ch[3];
+}ocl_channels;
 
 struct ocl_args_d_t
 {
@@ -69,6 +74,7 @@ struct ocl_args_d_t
 
 	cl_mem allocMem(size_t s);
 	ocl_channels allocMemChannels(size_t s);
+    void releaseMemChannels(ocl_channels rgb);
 
 	// Regular OpenCL objects:
 	cl_context       context;           // hold the context handler

From dba4c851828cdb34cba2fe9240b6778559636d67 Mon Sep 17 00:00:00 2001
From: ianuming <uming.zelda@gmail.com>
Date: Thu, 4 May 2017 19:50:38 +0800
Subject: [PATCH 016/189] Convert OpsinDynamicsImage to opencl

---
 clguetzli/clguetzli.cl                        | 101 ++++++++++++++++++
 clguetzli/clguetzli.cpp                       |  59 +++++-----
 clguetzli/ocl.cpp                             |   3 +-
 clguetzli/ocl.h                               |  16 +--
 .../butteraugli/butteraugli/butteraugli.cc    |  78 ++++++++++++++
 5 files changed, 221 insertions(+), 36 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index af70a10b..68f7eff0 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -224,4 +224,105 @@ __kernel void DownSample(__global float* pA, __global float* pC, int square)
 	const int sample_y = y / square;
 
 	pC[y * xsize + x] = pA[sample_y * oxsize + sample_x];
+}
+
+void OpsinAbsorbance(const double in[3], double out[3])
+{
+	const float mix[12] = {
+	0.348036746003,
+	0.577814843137,
+	0.0544556093735,
+	0.774145581713,
+	0.26922717275,
+	0.767247733938,
+	0.0366922708552,
+	0.920130265014,
+	0.0882062883536,
+	0.158581714673,
+	0.712857943858,
+	10.6524069248,
+	};
+
+	out[0] = mix[0] * in[0] + mix[1] * in[1] + mix[2] * in[2] + mix[3];
+	out[1] = mix[4] * in[0] + mix[5] * in[1] + mix[6] * in[2] + mix[7];
+	out[2] = mix[8] * in[0] + mix[9] * in[1] + mix[10] * in[2] + mix[11];
+}
+
+double EvaluatePolynomial(const double x, const double *coefficients, int n)
+{
+	double b1 = 0.0;
+	double b2 = 0.0;
+
+	for (int i = n - 1; i >= 0; i--)
+	{
+		if (i == 0) {
+			const double x_b1 = x * b1;
+			b1 = x_b1 - b2 + coefficients[0];
+			break;
+		}
+		const double x_b1 = x * b1;
+		const double t = (x_b1 + x_b1) - b2 + coefficients[i];
+		b2 = b1;
+		b1 = t;
+	}
+
+	return b1;
+}
+
+float Gamma(double v)
+{
+	double min_value = 0.770000000000000;
+	double max_value = 274.579999999999984;
+
+	static const double p[5 + 1] = {
+		881.979476556478289, 1496.058452015812463, 908.662212739659481,
+		373.566100223287378, 85.840860336314364, 6.683258861509244,
+	};
+	static const double q[5 + 1] = {
+		12.262350348616792, 20.557285797683576, 12.161463238367844,
+		4.711532733641639, 0.899112889751053, 0.035662329617191,
+	};
+
+	const double x01 = (v - min_value) / (max_value - min_value);
+	const double xc = 2.0 * x01 - 1.0;
+
+	const double yp = EvaluatePolynomial(xc, p, 6);
+	const double yq = EvaluatePolynomial(xc, q, 6);
+	if (yq == 0.0) return 0.0;
+	return (float)(yp / yq);
+}
+
+void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz)
+{
+	static const double a0 = 1.01611726948;
+	static const double a1 = 0.982482243696;
+	static const double a2 = 1.43571362627;
+	static const double a3 = 0.896039849412;
+	*valx = a0 * r - a1 * g;
+	*valy = a2 * r + a3 * g;
+	*valz = b;
+}
+
+__kernel void OpsinDynamicsImage(__global float *r, __global float *g, __global float *b, __global float *r_blurred, __global float *g_blurred, __global float *b_blurred, int size)
+{
+	const int i = get_global_id(0);
+	double pre[3] = { r_blurred[i], g_blurred[i],  b_blurred[i] };
+	double pre_mixed[3];
+	OpsinAbsorbance(pre, pre_mixed);
+	double sensitivity[3];
+	sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];
+	sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1];
+	sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2];
+
+	double cur_rgb[3] = { r_blurred[i], g_blurred[i],  b_blurred[i] };
+	double cur_mixed[3];
+    OpsinAbsorbance(cur_rgb, cur_mixed);
+    cur_mixed[0] *= sensitivity[0];
+    cur_mixed[1] *= sensitivity[1];
+    cur_mixed[2] *= sensitivity[2];
+    double x, y, z;
+	RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z);
+    r[i] = x;
+    g[i] = y;
+    b[i] = z;
 }
\ No newline at end of file
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 2ff242c4..6b7b2036 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -22,7 +22,7 @@ ocl_args_d_t& getOcl(void)
 
 	char* source = nullptr;
 	size_t src_size = 0;
-	ReadSourceFromFile("clguetzli.cl", &source, &src_size);
+	ReadSourceFromFile("clguetzli\\clguetzli.cl", &source, &src_size);
 
 	ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err);
 
@@ -46,6 +46,7 @@ ocl_args_d_t& getOcl(void)
 	ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "ConvolutionX", &err);
 	ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "ConvolutionY", &err);
 	ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "DownSample", &err);
+	ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "OpsinDynamicsImage", &err);
 
 	return ocl;
 }
@@ -314,32 +315,30 @@ void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
 }
 
 // ian todo
-void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t size)
+void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred, size_t size)
 {
-/*
-	for (size_t i = 0; i < rgb[0].size(); ++i) {
-		double sensitivity[3];
-		{
-			// Calculate sensitivity[3] based on the smoothed image gamma derivative.
-			double pre_rgb[3] = { blurred[0][i], blurred[1][i], blurred[2][i] };
-			double pre_mixed[3];
-			OpsinAbsorbance(pre_rgb, pre_mixed);
-			sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];
-			sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1];
-			sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2];
-		}
-		double cur_rgb[3] = { rgb[0][i],  rgb[1][i],  rgb[2][i] };
-		double cur_mixed[3];
-		OpsinAbsorbance(cur_rgb, cur_mixed);
-		cur_mixed[0] *= sensitivity[0];
-		cur_mixed[1] *= sensitivity[1];
-		cur_mixed[2] *= sensitivity[2];
-		double x, y, z;
-		RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z);
-		rgb[0][i] = static_cast<float>(x);
-		rgb[1][i] = static_cast<float>(y);
-		rgb[2][i] = static_cast<float>(z);
-*/
+	ocl_args_d_t &ocl = getOcl();
+	cl_int clSize = size;
+	cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&rgb.r);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&rgb.g);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&rgb.b);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&rgb_blurred.r);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&rgb_blurred.g);
+	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&rgb_blurred.b);
+	clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&clSize);
+
+	size_t globalWorkSize[1] = { clSize };
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
+	}
 }
 
 // strong todo
@@ -352,21 +351,26 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float*
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
     ocl_channels rgb = ocl.allocMemChannels(channel_size);
+	ocl_channels rgb_blurred = ocl.allocMemChannels(channel_size);
 
 	clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
+	clEnqueueCopyBuffer(ocl.commandQueue, rgb.r, rgb_blurred.r, 0, 0, channel_size, 0, NULL, NULL);
+	clEnqueueCopyBuffer(ocl.commandQueue, rgb.g, rgb_blurred.g, 0, 0, channel_size, 0, NULL, NULL);
+	clEnqueueCopyBuffer(ocl.commandQueue, rgb.b, rgb_blurred.b, 0, 0, channel_size, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
 	clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0);
 	clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0);
 	clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0);
 
-	clOpsinDynamicsImageEx(rgb, xsize * ysize);
+	clOpsinDynamicsImageEx(rgb, rgb_blurred, xsize * ysize);
 
 	cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
 	cl_float *result_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
 	cl_float *result_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+
 	err = clFinish(ocl.commandQueue);
 
 	memcpy(r, result_r, channel_size);
@@ -374,6 +378,7 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float*
 	memcpy(b, result_b, channel_size);
 
     ocl.releaseMemChannels(rgb);
+	ocl.releaseMemChannels(rgb_blurred);
 }
 
 // ian todo
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index 26c68f5c..5387a454 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -526,8 +526,7 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType)
 
 	// Query for all available OpenCL platforms on the system
 	// Here you enumerate all platforms and pick one which name has preferredPlatform as a sub-string
-	deviceType = CL_DEVICE_TYPE_GPU;
-	cl_platform_id platformId = FindOpenCLPlatform("", deviceType);
+	cl_platform_id platformId = FindOpenCLPlatform("Intel", deviceType);
 	if (NULL == platformId)
 	{
 		deviceType = CL_DEVICE_TYPE_CPU;
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 5eb19560..e5cf3d7c 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -43,13 +43,15 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType);
 * and make passing all these arguments in functions easier.
 */
 
-#define KERNEL_MINSQUAREVAL 0
-#define KERNEL_CONVOLUTION 1
-#define KERNEL_CONVOLUTIONX 2
-#define KERNEL_CONVOLUTIONY 3
-#define KERNEL_DOWNSAMPLE 4
-
-#define KERNEL_COUNT 5
+enum KernelName {
+	KERNEL_MINSQUAREVAL,
+	KERNEL_CONVOLUTION,
+	KERNEL_CONVOLUTIONX,
+	KERNEL_CONVOLUTIONY,
+	KERNEL_DOWNSAMPLE,
+	KERNEL_OPSINDYNAMICSIMAGE,
+	KERNEL_COUNT,
+};
 
 typedef union ocl_channels_t
 {
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index fb895b34..98451cca 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -928,6 +928,24 @@ inline void ClenshawRecursion<0>(const double x, const double *coefficients,
   *b1 = x_b1 - (*b2) + coefficients[0];
 }
 
+void ClenshawRecursion_fun(const double x, const double *coefficients,
+	double *b1, double *b2, int n)
+{
+	if (n == 0) {
+		const double x_b1 = x * (*b1);
+		// The final iteration differs - no 2 * x_b1 here.
+		*b1 = x_b1 - (*b2) + coefficients[0];
+		return;
+	}
+
+	const double x_b1 = x * (*b1);
+	const double t = (x_b1 + x_b1) - (*b2) + coefficients[n];
+	*b2 = *b1;
+	*b1 = t;
+
+	ClenshawRecursion_fun(x, coefficients, b1, b2, n - 1);
+}
+
 // Rational polynomial := dividing two polynomial evaluations. These are easier
 // to find than minimax polynomials.
 struct RationalPolynomial {
@@ -936,10 +954,34 @@ struct RationalPolynomial {
                                    const double (&coefficients)[N]) {
     double b1 = 0.0;
     double b2 = 0.0;
+
     ClenshawRecursion<N - 1>(x, coefficients, &b1, &b2);
+
     return b1;
   }
 
+#ifdef ENABLE_OPENCL_CHECK
+  static double EvaluatePolynomialNonRecursion(const double x, const double *coefficients, int n) {
+	double b1 = 0.0;
+	double b2 = 0.0;
+
+	for (int i = n - 1; i >= 0; i--)
+	{
+		if (i == 0) {
+			const double x_b1 = x * b1;
+			b1 = x_b1 - b2 + coefficients[0];
+			break;
+		}
+		const double x_b1 = x * b1;
+		const double t = (x_b1 + x_b1) - b2 + coefficients[i];
+		b2 = b1;
+		b1 = t;
+	}
+
+	return b1;
+  }
+#endif // ENABLE_OPENCL_CHECK
+
   // Evaluates the polynomial at x (in [min_value, max_value]).
   inline double operator()(const float x) const {
     // First normalize to [0, 1].
@@ -978,6 +1020,32 @@ static inline float GammaPolynomial(float value) {
   return static_cast<float>(r(value));
 }
 
+#ifdef ENABLE_OPENCL_CHECK
+static double GammaNonRecursion(double v) {
+	double min_value = 0.770000000000000;
+	double max_value = 274.579999999999984;
+
+	double p[5 + 1] = {
+		881.979476556478289, 1496.058452015812463, 908.662212739659481,
+		373.566100223287378, 85.840860336314364, 6.683258861509244,
+	};
+	double q[5 + 1] = {
+		12.262350348616792, 20.557285797683576, 12.161463238367844,
+		4.711532733641639, 0.899112889751053, 0.035662329617191,
+	};
+
+	// First normalize to [0, 1].
+	const double x01 = (v - min_value) / (max_value - min_value);
+	// And then to [-1, 1] domain of Chebyshev polynomials.
+	const double xc = 2.0 * x01 - 1.0;
+
+	const double yp = RationalPolynomial::EvaluatePolynomialNonRecursion(xc, p, 6);
+	const double yq = RationalPolynomial::EvaluatePolynomialNonRecursion(xc, q, 6);
+	if (yq == 0.0) return 0.0;
+	return static_cast<float>(yp / yq);
+}
+#endif // ENABLE_OPENCL_CHECK
+
 static inline double Gamma(double v) {
   // return SimpleGamma(v);
   return GammaPolynomial(static_cast<float>(v));
@@ -1001,6 +1069,16 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize,
       sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];
       sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1];
       sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2];
+
+#ifdef ENABLE_OPENCL_CHECK
+	  double sensitivity_new[3];
+	  sensitivity_new[0] = GammaNonRecursion(pre_mixed[0]) / pre_mixed[0];
+	  assert(fabs(sensitivity[0] - sensitivity_new[0]) < 0.01);
+	  sensitivity_new[1] = GammaNonRecursion(pre_mixed[1]) / pre_mixed[1];
+	  assert(fabs(sensitivity[1] - sensitivity_new[1]) < 0.01);
+	  sensitivity_new[2] = GammaNonRecursion(pre_mixed[2]) / pre_mixed[2];
+	  assert(fabs(sensitivity[2] - sensitivity_new[2]) < 0.01);
+#endif // ENABLE_OPENCL_CHECK
     }
     double cur_rgb[3] = { rgb[0][i],  rgb[1][i],  rgb[2][i] };
     double cur_mixed[3];

From ac4254e6f1692adedff283051b164b355c6f735a Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 4 May 2017 20:25:12 +0800
Subject: [PATCH 017/189] fix opencl compile error

---
 clguetzli/clguetzli.cl  | 25 ++++++++-----------------
 clguetzli/clguetzli.cpp | 11 ++++-------
 2 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 68f7eff0..bcf367b7 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1,11 +1,3 @@
-float minfun(float a, float b)
-{
-	if (a < b)
-		return a;
-	else
-		return b;
-}
-
 __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_size, int offset)
 {
 	const int x = get_global_id(0);
@@ -25,9 +17,8 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_si
 	{
 		for (int i = minW; i < maxW; i++)
 		{
-			minValue = minfun(minValue, pA[j * width + i]);
-//			float tmp = pA[j * width + i];
-//			if (tmp < minValue) minValue = tmp;
+			float tmp = pA[j * width + i];
+			if (tmp < minValue) minValue = tmp;
 		}
 	}
 
@@ -274,11 +265,11 @@ float Gamma(double v)
 	double min_value = 0.770000000000000;
 	double max_value = 274.579999999999984;
 
-	static const double p[5 + 1] = {
+	/*static*/ const double p[5 + 1] = {
 		881.979476556478289, 1496.058452015812463, 908.662212739659481,
 		373.566100223287378, 85.840860336314364, 6.683258861509244,
 	};
-	static const double q[5 + 1] = {
+	/*static*/ const double q[5 + 1] = {
 		12.262350348616792, 20.557285797683576, 12.161463238367844,
 		4.711532733641639, 0.899112889751053, 0.035662329617191,
 	};
@@ -294,10 +285,10 @@ float Gamma(double v)
 
 void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz)
 {
-	static const double a0 = 1.01611726948;
-	static const double a1 = 0.982482243696;
-	static const double a2 = 1.43571362627;
-	static const double a3 = 0.896039849412;
+	/*static*/ const double a0 = 1.01611726948;
+	/*static*/ const double a1 = 0.982482243696;
+	/*static*/ const double a2 = 1.43571362627;
+	/*static*/ const double a3 = 0.896039849412;
 	*valx = a0 * r - a1 * g;
 	*valy = a2 * r + a3 * g;
 	*valz = b;
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 6b7b2036..f3c9afce 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -356,14 +356,11 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float*
 	clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
-	clEnqueueCopyBuffer(ocl.commandQueue, rgb.r, rgb_blurred.r, 0, 0, channel_size, 0, NULL, NULL);
-	clEnqueueCopyBuffer(ocl.commandQueue, rgb.g, rgb_blurred.g, 0, 0, channel_size, 0, NULL, NULL);
-	clEnqueueCopyBuffer(ocl.commandQueue, rgb.b, rgb_blurred.b, 0, 0, channel_size, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
-	clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0);
-	clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0);
-	clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0);
+	clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r);
+	clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g);
+	clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
 
 	clOpsinDynamicsImageEx(rgb, rgb_blurred, xsize * ysize);
 
@@ -454,7 +451,7 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize)
     clScaleImageEx(img, xsize * ysize, scale, img);
 }
 
-//
+// ian todo
 void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset)
 {
 

From 0afb0a361d06d1d54dedf5e8abe0668ff1e4b8f6 Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Thu, 4 May 2017 21:05:17 +0800
Subject: [PATCH 018/189] open cl compiler error fix

---
 clguetzli/clguetzli.cl | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index bcf367b7..bf3b22ef 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -260,16 +260,16 @@ double EvaluatePolynomial(const double x, const double *coefficients, int n)
 	return b1;
 }
 
-float Gamma(double v)
+double Gamma(double v)
 {
 	double min_value = 0.770000000000000;
 	double max_value = 274.579999999999984;
 
-	/*static*/ const double p[5 + 1] = {
+	const double p[5 + 1] = {
 		881.979476556478289, 1496.058452015812463, 908.662212739659481,
 		373.566100223287378, 85.840860336314364, 6.683258861509244,
 	};
-	/*static*/ const double q[5 + 1] = {
+	const double q[5 + 1] = {
 		12.262350348616792, 20.557285797683576, 12.161463238367844,
 		4.711532733641639, 0.899112889751053, 0.035662329617191,
 	};
@@ -285,10 +285,10 @@ float Gamma(double v)
 
 void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz)
 {
-	/*static*/ const double a0 = 1.01611726948;
-	/*static*/ const double a1 = 0.982482243696;
-	/*static*/ const double a2 = 1.43571362627;
-	/*static*/ const double a3 = 0.896039849412;
+	const double a0 = 1.01611726948;
+	const double a1 = 0.982482243696;
+	const double a2 = 1.43571362627;
+	const double a3 = 0.896039849412;
 	*valx = a0 * r - a1 * g;
 	*valy = a2 * r + a3 * g;
 	*valz = b;

From 2cbc518b60a38830be6770a8ebc9f7a04a93f37c Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Thu, 4 May 2017 22:48:50 +0800
Subject: [PATCH 019/189] Implement clConvolutionEx

---
 clguetzli/clguetzli.cpp | 49 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 43 insertions(+), 6 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index f3c9afce..bf92f5a4 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -243,14 +243,51 @@ void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double bor
 		memcpy(channel, resultPtr, sizeof(cl_float) * xsize * ysize);
 	}
 }
-//=========================================================
-// ian todo
-void clConvolutionEx(cl_mem image, size_t xsize, size_t ysize,
-				     cl_mem expn, size_t expn_size,
-                     int step, int offset, double border_ratio,
+
+void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize,
+				     cl_mem multipliers, size_t len,
+                     int xstep, int offset, double border_ratio,
                      cl_mem result/*out*/)
 {
-	// Convolution
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+
+	size_t oxsize = xsize / xstep;
+
+	ocl.allocA(sizeof(cl_float) * len);
+	ocl.allocB(sizeof(cl_float) * xsize * ysize);
+	ocl.allocC(sizeof(cl_float) * oxsize * ysize);
+
+	memcpy(ocl.inputA, multipliers, sizeof(cl_float) * len);
+	memcpy(ocl.inputB, inp, sizeof(cl_float) * xsize * ysize);
+
+	cl_int clxsize = xsize;
+	cl_int clxstep = xstep;
+	cl_int cllen = len;
+	cl_int cloffset = offset;
+	cl_float clborder_ratio = border_ratio;
+
+	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&multipliers);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result);
+	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize);
+	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep);
+	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cllen);
+	clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset);
+	clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio);
+
+	size_t globalWorkSize[2] = { oxsize, ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
+	}
 }
 
 // ian todo

From fad11fc21f50a4bcfbab72e08e3a9ff79b5cada7 Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Thu, 4 May 2017 22:50:55 +0800
Subject: [PATCH 020/189] Remove useless code

---
 clguetzli/clguetzli.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index bf92f5a4..92a8558a 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -254,13 +254,6 @@ void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize,
 
 	size_t oxsize = xsize / xstep;
 
-	ocl.allocA(sizeof(cl_float) * len);
-	ocl.allocB(sizeof(cl_float) * xsize * ysize);
-	ocl.allocC(sizeof(cl_float) * oxsize * ysize);
-
-	memcpy(ocl.inputA, multipliers, sizeof(cl_float) * len);
-	memcpy(ocl.inputB, inp, sizeof(cl_float) * xsize * ysize);
-
 	cl_int clxsize = xsize;
 	cl_int clxstep = xstep;
 	cl_int cllen = len;

From b5013933a421e5bf520783aee1cf445ffeb33309 Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Thu, 4 May 2017 23:16:41 +0800
Subject: [PATCH 021/189] Implement clUpsampleEx

---
 clguetzli/clguetzli.cl  |  8 ++++----
 clguetzli/clguetzli.cpp | 39 ++++++++++++++++++++++++++-------------
 2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index bf3b22ef..1f1ff8e2 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -201,7 +201,7 @@ __kernel void ConvolutionY(__global float* multipliers, __global float* inp, __g
 	result[y * xsize + x] = sum * scale;
 }
 
-__kernel void DownSample(__global float* pA, __global float* pC, int square)
+__kernel void DownSample(__global float* pA, __global float* pC, int xstep, int ystep)
 {
 	const int x = get_global_id(0);
 	const int y = get_global_id(1);
@@ -209,10 +209,10 @@ __kernel void DownSample(__global float* pA, __global float* pC, int square)
 	const int xsize = get_global_size(0);
 	const int ysize = get_global_size(1);
 
-	const int oxsize = xsize / square;
+	const int oxsize = xsize / xstep;
 
-	const int sample_x = x / square;
-	const int sample_y = y / square;
+	const int sample_x = x / xstep;
+	const int sample_y = y / ystep;
 
 	pC[y * xsize + x] = pA[sample_y * oxsize + sample_x];
 }
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 92a8558a..9446886f 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -232,6 +232,7 @@ void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double bor
 		clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcB);
 		clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem);
 		clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clstep);
+		clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clstep);
 
 		globalWorkSize[0] = ysize;
 		globalWorkSize[1] = xsize;
@@ -274,29 +275,41 @@ void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize,
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
-		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
+		LogError("Error: clConvolutionEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
 	}
 	err = clFinish(ocl.commandQueue);
 	if (CL_SUCCESS != err)
 	{
-		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
+		LogError("Error: clConvolutionEx() clFinish returned %s.\n", TranslateOpenCLError(err));
 	}
 }
 
-// ian todo
 void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize,
                   size_t xstep, size_t ystep,
                   cl_mem result/*out*/)
 {
-/*
-	for (size_t y = 0; y < ysize; y++) {
-		for (size_t x = 0; x < xsize; x++) {
-			// TODO: Use correct rounding.
-			channel[y * xsize + x] =
-				downsampled_output[(y / ystep) * dxsize + (x / xstep)];
-		}
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+
+	cl_int clxstep = xstep;
+	cl_int clystep = ystep;
+	cl_kernel kernel = ocl.kernel[KERNEL_DOWNSAMPLE];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcB);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem);
+	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxstep);
+	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clystep);
+
+	size_t globalWorkSize[2] = { ysize, xsize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clUpsampleEx clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clUpsampleEx clFinish returned %s.\n", TranslateOpenCLError(err));
 	}
-*/
 }
 
 void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
@@ -362,12 +375,12 @@ void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
-		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
+		LogError("Error: clOpsinDynamicsImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
 	}
 	err = clFinish(ocl.commandQueue);
 	if (CL_SUCCESS != err)
 	{
-		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
+		LogError("Error: clOpsinDynamicsImageEx() clFinish returned %s.\n", TranslateOpenCLError(err));
 	}
 }
 

From 8909cdacb2fd6ab6690fe80ad734558f0fc4f68c Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Thu, 4 May 2017 23:52:23 +0800
Subject: [PATCH 022/189] Implement clMinSquareValEx

---
 clguetzli/clguetzli.cpp | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 9446886f..a0fd943b 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -357,7 +357,6 @@ void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
 	clReleaseMemObject(mem_expn);
 }
 
-// ian todo
 void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred, size_t size)
 {
 	ocl_args_d_t &ocl = getOcl();
@@ -494,10 +493,38 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize)
     clScaleImageEx(img, xsize * ysize, scale, img);
 }
 
-// ian todo
 void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset)
 {
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
 
+	cl_int cloffset = offset;
+	cl_int clsquare_size = square_size;
+	ocl.allocA(sizeof(cl_float) * xsize * ysize);
+
+	cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.srcA);
+	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clsquare_size);
+	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cloffset);
+
+	size_t globalWorkSize[2] = { xsize, ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clMinSquareValEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+
+	err = clEnqueueCopyBuffer(ocl.commandQueue, ocl.srcA, img, 0, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clMinSquareValEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clMinSquareValEx() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
 }
 
 static const double kInternalGoodQualityThreshold = 14.921561160295326;

From 5ea138c058b6008ef209da8ed7de374d825f55af Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Fri, 5 May 2017 00:48:29 +0800
Subject: [PATCH 023/189] Implement clMaskEx

---
 clguetzli/clguetzli.cl  | 125 +++++++++++++++++++++++++++++++++++++++-
 clguetzli/clguetzli.cpp |  66 +++++++++++----------
 clguetzli/ocl.h         |   1 +
 3 files changed, 161 insertions(+), 31 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 1f1ff8e2..73e06b62 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -316,4 +316,127 @@ __kernel void OpsinDynamicsImage(__global float *r, __global float *g, __global
     r[i] = x;
     g[i] = y;
     b[i] = z;
-}
\ No newline at end of file
+}
+
+
+double InterpolateClampNegative(const double *array,
+	int size, double sx) {
+	if (sx < 0) {
+		sx = 0;
+	}
+	double ix = fabs(sx);
+	int baseix = (int)(ix);
+	double res;
+	if (baseix >= size - 1) {
+		res = array[size - 1];
+	}
+	else {
+		double mix = ix - baseix;
+		int nextix = baseix + 1;
+		res = array[baseix] + mix * (array[nextix] - array[baseix]);
+	}
+	return res;
+}
+
+void MakeMask(double extmul, double extoff,
+	double mul, double offset,
+	double scaler, double *result)
+{
+	for (size_t i = 0; i < 512; ++i) {
+		const double c = mul / ((0.01 * scaler * i) + offset);
+		result[i] = 1.0 + extmul * (c + extoff);
+		result[i] *= result[i];
+	}
+}
+
+double MaskX(double delta) {
+	const double extmul = 0.975741017749;
+	const double extoff = -4.25328244168;
+	const double offset = 0.454909521427;
+	const double scaler = 0.0738288224836;
+	const double mul = 20.8029176447;
+	double lut[512];
+	MakeMask(extmul, extoff, mul, offset, scaler, lut);
+	return InterpolateClampNegative(lut, 512, delta);
+}
+
+double MaskY(double delta) {
+	const double extmul = 0.373995618954;
+	const double extoff = 1.5307267433;
+	const double offset = 0.911952641929;
+	const double scaler = 1.1731667845;
+	const double mul = 16.2447033988;
+	double lut[512];
+	MakeMask(extmul, extoff, mul, offset, scaler, lut);
+	return InterpolateClampNegative(lut, 512, delta);
+}
+
+double MaskB(double delta) {
+	const double extmul = 0.61582234137;
+	const double extoff = -4.25376118646;
+	const double offset = 1.05105070921;
+	const double scaler = 0.47434643535;
+	const double mul = 31.1444967089;
+	double lut[512];
+	MakeMask(extmul, extoff, mul, offset, scaler, lut);
+	return InterpolateClampNegative(lut, 512, delta);
+}
+
+double MaskDcX(double delta) {
+	const double extmul = 1.79116943438;
+	const double extoff = -3.86797479189;
+	const double offset = 0.670960225853;
+	const double scaler = 0.486575865525;
+	const double mul = 20.4563479139;
+	double lut[512];
+	MakeMask(extmul, extoff, mul, offset, scaler, lut);
+	return InterpolateClampNegative(lut, 512, delta);
+}
+
+double MaskDcY(double delta) {
+	const double extmul = 0.212223514236;
+	const double extoff = -3.65647120524;
+	const double offset = 1.73396799447;
+	const double scaler = 0.170392660501;
+	const double mul = 21.6566724788;
+	double lut[512];
+	MakeMask(extmul, extoff, mul, offset, scaler, lut);
+	return InterpolateClampNegative(lut, 512, delta);
+}
+
+double MaskDcB(double delta) {
+	const double extmul = 0.349376011816;
+	const double extoff = -0.894711072781;
+	const double offset = 0.901647926679;
+	const double scaler = 0.380086095024;
+	const double mul = 18.0373825149;
+	double lut[512];
+	MakeMask(extmul, extoff, mul, offset, scaler, lut);
+	return InterpolateClampNegative(lut, 512, delta);
+}
+
+__kernel void DoMask(__global float *mask_x, __global float *mask_y, __global float *mask_b, __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, int xsize, int ysize)
+{
+	const double w00 = 232.206464018;
+	const double w11 = 22.9455222245;
+	const double w22 = 503.962310606;
+
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+
+	const size_t idx = y * xsize + x;
+	const double s0 = mask_x[idx];
+	const double s1 = mask_y[idx];
+	const double s2 = mask_b[idx];
+	const double p0 = w00 * s0;
+	const double p1 = w11 * s1;
+	const double p2 = w22 * s2;
+
+	mask_x[idx] = (float)(MaskX(p0));
+	mask_y[idx] = (float)(MaskY(p1));
+	mask_b[idx] = (float)(MaskB(p2));
+	mask_dc_x[idx] = (float)(MaskDcX(p0));
+	mask_dc_y[idx] = (float)(MaskDcY(p1));
+	mask_dc_b[idx] = (float)(MaskDcB(p2));
+
+}
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index a0fd943b..7bd75e8e 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -47,6 +47,7 @@ ocl_args_d_t& getOcl(void)
 	ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "ConvolutionY", &err);
 	ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "DownSample", &err);
 	ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "OpsinDynamicsImage", &err);
+	ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "DoMask", &err);
 
 	return ocl;
 }
@@ -530,7 +531,37 @@ void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t s
 static const double kInternalGoodQualityThreshold = 14.921561160295326;
 static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold;
 
-// ian todo
+void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, size_t xsize, size_t ysize)
+{
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+
+	cl_int clxsize = xsize;
+	cl_int clysize = ysize;
+
+	cl_kernel kernel = ocl.kernel[KERNEL_DOMASK];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.r);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.g);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.b);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask_dc.r);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mask_dc.g);
+	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask_dc.b);
+	clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&clxsize);
+	clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&clysize);
+
+	size_t globalWorkSize[2] = { xsize, ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clDoMask() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clDoMask() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
+}
+
 void clMaskEx(ocl_channels rgb, ocl_channels rgb2,
 	size_t xsize, size_t ysize,
 	ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/)
@@ -549,34 +580,9 @@ void clMaskEx(ocl_channels rgb, ocl_channels rgb2,
 
         clBlurEx(mask.ch[i], xsize, ysize, sigma[i], 0.0);
     }
-/*
-    static const double w00 = 232.206464018;
-    static const double w11 = 22.9455222245;
-    static const double w22 = 503.962310606;
 
-    mask_dc->resize(3);
-    for (int i = 0; i < 3; ++i) {
-        (*mask_dc)[i].resize(xsize * ysize);
-    }
-    for (size_t y = 0; y < ysize; ++y) {
-        for (size_t x = 0; x < xsize; ++x) {
-            const size_t idx = y * xsize + x;
-            const double s0 = (*mask)[0][idx];
-            const double s1 = (*mask)[1][idx];
-            const double s2 = (*mask)[2][idx];
-            const double p0 = w00 * s0;
-            const double p1 = w11 * s1;
-            const double p2 = w22 * s2;
-
-            (*mask)[0][idx] = static_cast<float>(MaskX(p0));
-            (*mask)[1][idx] = static_cast<float>(MaskY(p1));
-            (*mask)[2][idx] = static_cast<float>(MaskB(p2));
-            (*mask_dc)[0][idx] = static_cast<float>(MaskDcX(p0));
-            (*mask_dc)[1][idx] = static_cast<float>(MaskDcY(p1));
-            (*mask_dc)[2][idx] = static_cast<float>(MaskDcB(p2));
-        }
-    }
-*/
+	clDoMask(mask, mask_dc, xsize, ysize);
+
     for (int i = 0; i < 3; i++)
     {
         clScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale, mask.ch[i]);
@@ -673,8 +679,8 @@ void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b,
 	cl_mem block_diff_dc = ocl.allocMem(3 * xsize * ysize);
 	cl_mem block_diff_ac = ocl.allocMem(3 * xsize * ysize);
 
-	ocl_channels mask;
-	ocl_channels mask_dc;
+	ocl_channels mask = ocl.allocMemChannels(channel_size);
+	ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
 
 	cl_mem mem_result;
 
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index e5cf3d7c..6cb0a916 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -50,6 +50,7 @@ enum KernelName {
 	KERNEL_CONVOLUTIONY,
 	KERNEL_DOWNSAMPLE,
 	KERNEL_OPSINDYNAMICSIMAGE,
+	KERNEL_DOMASK,
 	KERNEL_COUNT,
 };
 

From 437fa09e6b58fafaf244b94f34742cadc41d0e9b Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Fri, 5 May 2017 01:03:47 +0800
Subject: [PATCH 024/189] Implement clScaleImageEx

---
 clguetzli/clguetzli.cl  |  6 ++++++
 clguetzli/clguetzli.cpp | 29 ++++++++++++++++++++++++++++-
 clguetzli/ocl.h         |  1 +
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 73e06b62..4c70b380 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -440,3 +440,9 @@ __kernel void DoMask(__global float *mask_x, __global float *mask_y, __global fl
 	mask_dc_b[idx] = (float)(MaskDcB(p2));
 
 }
+
+__kernel void ScaleImage(double scale, __global float *result)
+{
+	const int i = get_global_id(0);
+	result[i] *= (float)(scale);
+}
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 7bd75e8e..67fc7503 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -48,6 +48,7 @@ ocl_args_d_t& getOcl(void)
 	ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "DownSample", &err);
 	ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "OpsinDynamicsImage", &err);
 	ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "DoMask", &err);
+	ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "ScaleImage", &err);
 
 	return ocl;
 }
@@ -473,10 +474,36 @@ void clDiffPrecomputeEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_
 
 }
 
-// ian todo
 void clScaleImageEx(cl_mem img, size_t size, float w, cl_mem result/*out*/)
 {
+	// Writes img * w into result for `size` floats; img and result may be the
+	// same buffer, in which case the scaling is done in place.
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+	// The ScaleImage kernel declares `double scale`, so pass a full cl_double.
+	cl_double clscale = w;
+	// Copy size is in bytes; an overlapping self-copy is rejected, so skip it.
+	if (img != result) err = clEnqueueCopyBuffer(ocl.commandQueue, img, result, 0, 0, size * sizeof(cl_float), 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clScaleImageEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err));
+	}
 
+	cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE];
+	clSetKernelArg(kernel, 0, sizeof(cl_double), (void*)&clscale);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&result);
+
+	size_t globalWorkSize[1] = { size };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clScaleImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clScaleImageEx() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
 }
 
 // ian todo
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 6cb0a916..53fac1c8 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -51,6 +51,7 @@ enum KernelName {
 	KERNEL_DOWNSAMPLE,
 	KERNEL_OPSINDYNAMICSIMAGE,
 	KERNEL_DOMASK,
+	KERNEL_SCALEIMAGE,
 	KERNEL_COUNT,
 };
 

From a31adf1d993beb5c3dce93eecd7b7766c77c1876 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 5 May 2017 01:40:35 +0800
Subject: [PATCH 025/189] =?UTF-8?q?=E9=AA=8C=E8=AF=81clOpinDynamicImage?=
 =?UTF-8?q?=E7=9A=84=E6=95=88=E6=9E=9C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cpp                       |  2 +-
 clguetzli/clguetzli.h                         |  2 ++
 .../butteraugli/butteraugli/butteraugli.cc    | 22 ++++++++++++++++---
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 67fc7503..24802e59 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -363,7 +363,7 @@ void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred
 {
 	ocl_args_d_t &ocl = getOcl();
 	cl_int clSize = size;
-	cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL];
+	cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE];
 	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&rgb.r);
 	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&rgb.g);
 	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&rgb.b);
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index edaa0688..ca8b0b32 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -17,3 +17,5 @@ void clConvolution(size_t xsize, size_t ysize,
 	float* result);
 
 void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio);
+
+void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b);
\ No newline at end of file
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 98451cca..d0c59129 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -123,6 +123,7 @@ static void Convolution(size_t xsize, size_t ysize,
 void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
           double border_ratio) {
 
+/*
 #if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK)
 	if (g_useOpenCL && xsize > 100 && ysize > 100)
 	{
@@ -134,7 +135,7 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
 	std::vector<float> tmpChannel(xsize  * ysize);
 	memcpy(tmpChannel.data(), channel, xsize * ysize * sizeof(float));
 #endif
-
+*/
   PROFILER_FUNC;
   double m = 2.25;  // Accuracy increases when m is increased.
   const double scaler = -1.0 / (2 * sigma * sigma);
@@ -171,6 +172,7 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
     }
   }
 
+  /*
 #ifdef ENABLE_OPENCL_CHECK
   // for verify
   {
@@ -187,6 +189,7 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
 	  }
   }
 #endif // ENABLE_OPENCL_CHECK
+*/
 }
 
 // To change this to n, add the relevant FFTn function and kFFTnMapIndexTable.
@@ -1053,6 +1056,17 @@ static inline double Gamma(double v) {
 
 void OpsinDynamicsImage(size_t xsize, size_t ysize,
                         std::vector<std::vector<float> > &rgb) {
+
+    if (g_useOpenCL && xsize > 100 && ysize > 100)
+    {
+        float * r = rgb[0].data();
+        float * g = rgb[1].data();
+        float * b = rgb[2].data();
+
+        clOpsinDynamicsImage(xsize, ysize, r, g, b);
+        return;
+    }
+
   PROFILER_FUNC;
   std::vector<std::vector<float> > blurred = rgb;
   static const double kSigma = 1.1;
@@ -1448,6 +1462,7 @@ double MaskDcB(double delta) {
 void MinSquareVal(size_t square_size, size_t offset,
                   size_t xsize, size_t ysize,
                   float *values) {
+/*
 #if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK)
 	if (g_useOpenCL)
 	{
@@ -1461,7 +1476,7 @@ void MinSquareVal(size_t square_size, size_t offset,
 #ifdef ENABLE_OPENCL_CHECK
   std::vector<float> backup(values, values + xsize * ysize);
 #endif
-
+*/
   // offset is not negative and smaller than square_size.
   assert(offset < square_size);
   std::vector<float> tmp(xsize * ysize);
@@ -1502,7 +1517,7 @@ void MinSquareVal(size_t square_size, size_t offset,
         *pValuePoint = min; pValuePoint += xsize;
     }
   }
-
+/*
 #ifdef ENABLE_OPENCL_CHECK
   clMinSquareVal(square_size, offset, xsize, ysize, backup.data());
   for (int i = 0; i < xsize * ysize; i++)
@@ -1513,6 +1528,7 @@ void MinSquareVal(size_t square_size, size_t offset,
 	  }
   }
 #endif
+*/
 }
 
 // ===== Functions used by Mask only =====

From c30b44db45a5c3c005ea6822dc49c4c5d03cef3e Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 5 May 2017 09:23:45 +0800
Subject: [PATCH 026/189] =?UTF-8?q?=E5=B0=9D=E8=AF=95=E5=8F=8C=E7=B2=BE?=
 =?UTF-8?q?=E5=BA=A6=E8=BF=90=E7=AE=97=E6=94=AF=E6=8C=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl  | 12 ++++++++++--
 clguetzli/clguetzli.cpp | 13 ++++++++++++-
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 4c70b380..9deee460 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1,3 +1,11 @@
+//#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+//#elif defined(cl_amd_fp64)
+//#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+//#else
+//#error "Double precision floating point not supported by OpenCL implementation."
+//#endif
+
 __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_size, int offset)
 {
 	const int x = get_global_id(0);
@@ -211,8 +219,8 @@ __kernel void DownSample(__global float* pA, __global float* pC, int xstep, int
 
 	const int oxsize = xsize / xstep;
 
-	const int sample_x = x / xstep;
-	const int sample_y = y / ystep;
+	const int sample_x = x / xstep * xstep;
+	const int sample_y = y / ystep * ystep;
 
 	pC[y * xsize + x] = pA[sample_y * oxsize + sample_x];
 }
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 24802e59..2ca1ceb5 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -22,7 +22,7 @@ ocl_args_d_t& getOcl(void)
 
 	char* source = nullptr;
 	size_t src_size = 0;
-	ReadSourceFromFile("clguetzli\\clguetzli.cl", &source, &src_size);
+	ReadSourceFromFile("clguetzli.cl", &source, &src_size);
 
 	ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err);
 
@@ -32,6 +32,17 @@ ocl_args_d_t& getOcl(void)
 	if (CL_SUCCESS != err)
 	{
 		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
+
+        if (err == CL_BUILD_PROGRAM_FAILURE)
+        {
+            size_t log_size = 0;
+            clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+
+            std::vector<char> build_log(log_size);
+            clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL);
+
+            LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]);
+        }
 	}
 	ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "MinSquareVal", &err);
 	if (CL_SUCCESS != err)

From 2e4cf390dc32269ba810877750671d8f4b4314fe Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 5 May 2017 12:00:33 +0800
Subject: [PATCH 027/189] Print More DeviceInfo

---
 clguetzli/ocl.cpp                             | 28 ++++--
 .../butteraugli/butteraugli/butteraugli.cc    | 94 +------------------
 2 files changed, 22 insertions(+), 100 deletions(-)

diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index 5387a454..05f5470f 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -382,12 +382,18 @@ cl_platform_id FindOpenCLPlatform(const char* preferredPlatform, cl_device_type
 		bool match = true;
 		cl_uint numDevices = 0;
 
-		// If the preferredPlatform is not NULL then check if platforms[i] is the required one
-		// Otherwise, continue the check with platforms[i]
+		size_t nameLen = 0;
+		clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &nameLen);
+
+		std::vector<char> platformName(nameLen + 1);
+		clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, nameLen, &platformName[0], NULL);
+		platformName[nameLen] = 0;
+
+		LogError("DeviceName: %s\n", platformName.data());
+
 		if ((NULL != preferredPlatform) && (strlen(preferredPlatform) > 0))
 		{
-			// In case we're looking for a specific platform
-			match = CheckPreferredPlatformMatch(platforms[i], preferredPlatform);
+			match = (strstr(&platformName[0], preferredPlatform) != 0);
 		}
 
 		// match is true if the platform's name is the required one or don't care (NULL)
@@ -400,12 +406,20 @@ cl_platform_id FindOpenCLPlatform(const char* preferredPlatform, cl_device_type
 			err = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &numDevices);
 			if (CL_SUCCESS != err)
 			{
-				LogError("clGetDeviceIDs() returned %s.\n", TranslateOpenCLError(err));
+				if (CL_DEVICE_TYPE_GPU == deviceType)
+				{
+					LogError("%s try GPU returned %s.\n", platformName.data(), TranslateOpenCLError(err));
+				}
+				if (CL_DEVICE_TYPE_CPU == deviceType)
+				{
+					LogError("%s try CPU returned %s.\n", platformName.data(), TranslateOpenCLError(err));
+				}
 			}
 
 			if (0 != numDevices)
 			{
 				// There is at list one device that answer the requirements
+				LogError("SelectDevice: %s GPU=%d\n", platformName.data(), deviceType == CL_DEVICE_TYPE_GPU ? 1 : 0);
 				return platforms[i];
 			}
 		}
@@ -526,11 +540,11 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType)
 
 	// Query for all available OpenCL platforms on the system
 	// Here you enumerate all platforms and pick one which name has preferredPlatform as a sub-string
-	cl_platform_id platformId = FindOpenCLPlatform("Intel", deviceType);
+	cl_platform_id platformId = FindOpenCLPlatform(nullptr, deviceType);
 	if (NULL == platformId)
 	{
 		deviceType = CL_DEVICE_TYPE_CPU;
-		platformId = FindOpenCLPlatform("", deviceType);
+		platformId = FindOpenCLPlatform(nullptr, deviceType);
 	}
 
 	if (NULL == platformId)
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index d0c59129..0753d713 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -68,16 +68,6 @@ static void Convolution(size_t xsize, size_t ysize,
 	const float* __restrict__ inp,
 	float border_ratio,
 	float* __restrict__ result) {
-/*
-#if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK)
-	if (g_useOpenCL && xsize > 100 && ysize > 100)
-	{
-		clConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
-		return;
-	}
-#endif // ENABLE_OPENCL
-*/
-
   PROFILER_FUNC;
   float weight_no_border = 0;
 
@@ -102,40 +92,11 @@ static void Convolution(size_t xsize, size_t ysize,
       result[ox * ysize + y] = static_cast<float>(sum * scale);
     }
   }
-
-  /*
-#ifdef ENABLE_OPENCL_CHECK
-  // for verify
-  std::vector<float> tmp(xsize / xstep * ysize);
-  clConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, &tmp[0]);
-
-  for (int i = 0; i < xsize / xstep * ysize; i++)
-  {
-	  if (fabs(result[i] - tmp[i]) > 0.0001)
-	  {
-		  assert(false);
-	  }
-  }
-#endif // ENABLE_OPENCL_CHECK
-*/
 }
 
 void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
           double border_ratio) {
 
-/*
-#if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK)
-	if (g_useOpenCL && xsize > 100 && ysize > 100)
-	{
-		clBlur(xsize, ysize, channel, sigma, border_ratio);
-		return;
-	}
-#endif // ENABLE_OPENCL
-#ifdef ENABLE_OPENCL_CHECK
-	std::vector<float> tmpChannel(xsize  * ysize);
-	memcpy(tmpChannel.data(), channel, xsize * ysize * sizeof(float));
-#endif
-*/
   PROFILER_FUNC;
   double m = 2.25;  // Accuracy increases when m is increased.
   const double scaler = -1.0 / (2 * sigma * sigma);
@@ -171,25 +132,6 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
       }
     }
   }
-
-  /*
-#ifdef ENABLE_OPENCL_CHECK
-  // for verify
-  {
-	  if (xsize < 100 || ysize < 100) return;
-
-	  clBlur(xsize, ysize, tmpChannel.data(), sigma, border_ratio);
-
-	  for (int i = 0; i < xsize * ysize; i++)
-	  {
-		  if (fabs(channel[i] - tmpChannel[i]) > 0.0001)
-		  {
-			  float k = channel[i] - tmpChannel[i];
-		  }
-	  }
-  }
-#endif // ENABLE_OPENCL_CHECK
-*/
 }
 
 // To change this to n, add the relevant FFTn function and kFFTnMapIndexTable.
@@ -1452,31 +1394,9 @@ double MaskDcB(double delta) {
   return InterpolateClampNegative(lut.data(), lut.size(), delta);
 }
 
-// Replaces values[x + y * xsize] with the minimum of the values in the
-// square_size square with coordinates
-//   x - offset .. x + square_size - offset - 1,
-//   y - offset .. y + square_size - offset - 1.
-
-// ʵ�ʹ�����squre_sizeһֱΪ4��offsetΪ0������SIMD�ػ�
-
 void MinSquareVal(size_t square_size, size_t offset,
-                  size_t xsize, size_t ysize,
+				  size_t xsize, size_t ysize,
                   float *values) {
-/*
-#if (defined ENABLE_OPENCL) && (!defined ENABLE_OPENCL_CHECK)
-	if (g_useOpenCL)
-	{
-		clMinSquareVal(square_size, offset, xsize, ysize, values);
-		return;
-	}
-#endif // ENABLE_OPENCL
-
-  PROFILER_FUNC;
-
-#ifdef ENABLE_OPENCL_CHECK
-  std::vector<float> backup(values, values + xsize * ysize);
-#endif
-*/
   // offset is not negative and smaller than square_size.
   assert(offset < square_size);
   std::vector<float> tmp(xsize * ysize);
@@ -1517,18 +1437,6 @@ void MinSquareVal(size_t square_size, size_t offset,
         *pValuePoint = min; pValuePoint += xsize;
     }
   }
-/*
-#ifdef ENABLE_OPENCL_CHECK
-  clMinSquareVal(square_size, offset, xsize, ysize, backup.data());
-  for (int i = 0; i < xsize * ysize; i++)
-  {
-	  if (fabs(backup[i] - values[i]) > 0.0001)
-	  {
-		  assert(false);
-	  }
-  }
-#endif
-*/
 }
 
 // ===== Functions used by Mask only =====

From fd520d3236334b73b0c0d17f8680db47beb415ab Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 5 May 2017 16:12:01 +0800
Subject: [PATCH 028/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

# Conflicts:
#	clguetzli/clguetzli.cl
#	clguetzli/clguetzli.cpp
---
 clguetzli/clguetzli.cl                        | 177 +++++++++++++++++-
 clguetzli/clguetzli.cpp                       | 127 ++++++++++---
 clguetzli/clguetzli.h                         |   8 +-
 clguetzli/ocl.h                               |   3 +-
 .../butteraugli/butteraugli/butteraugli.cc    |   9 +
 5 files changed, 295 insertions(+), 29 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 9deee460..3f9820dc 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -302,7 +302,10 @@ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *
 	*valz = b;
 }
 
-__kernel void OpsinDynamicsImage(__global float *r, __global float *g, __global float *b, __global float *r_blurred, __global float *g_blurred, __global float *b_blurred, int size)
+__kernel void OpsinDynamicsImage(
+	__global float *r, __global float *g, __global float *b,
+	__global float *r_blurred, __global float *g_blurred, __global float *b_blurred,
+	int size)
 {
 	const int i = get_global_id(0);
 	double pre[3] = { r_blurred[i], g_blurred[i],  b_blurred[i] };
@@ -423,7 +426,10 @@ double MaskDcB(double delta) {
 	return InterpolateClampNegative(lut, 512, delta);
 }
 
-__kernel void DoMask(__global float *mask_x, __global float *mask_y, __global float *mask_b, __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b, int xsize, int ysize)
+__kernel void DoMask(
+	__global float *mask_x, __global float *mask_y, __global float *mask_b,
+	__global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b,
+	int xsize, int ysize)
 {
 	const double w00 = 232.206464018;
 	const double w11 = 22.9455222245;
@@ -454,3 +460,170 @@ __kernel void ScaleImage(double scale, __global float *result)
 	const int i = get_global_id(0);
 	result[i] *= (float)(scale);
 }
+
+// Dot product of three consecutive __global floats with a private double[3].
+double DotProduct(__global const float *u, const double *v) {
+  return u[0] * v[0] + u[1] * v[1] + u[2] * v[2];
+}
+
+// Stores dc . dc_mask + ac . mask + edge . mask into result[res_ix].
+// NOTE(review): the bounds tests scale the ids by step, the indexing does not - confirm against the host launch.
+__kernel void CombineChannels(
+	__global float *mask_x, __global float *mask_y, __global float *mask_b,
+	__global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b,
+	__global float *block_diff_dc, __global float *block_diff_ac,
+	__global float *edge_detector_map,
+	int xsize, int ysize, int step, int res_xsize,
+	__global float *result)
+{
+	const int res_x = get_global_id(0);
+	const int res_y = get_global_id(1);
+
+	if (res_x * step >= xsize - (8 - step)) return;
+	if (res_y * step >= ysize - (8 - step)) return;
+
+	// All six masks are read at the same position, offset by (+3, +3).
+	const int mask_ix = (res_y + 3) * xsize + (res_x + 3);
+	double mask[3];
+	double dc_mask[3];
+	mask[0] = mask_x[mask_ix];
+	dc_mask[0] = mask_dc_x[mask_ix];
+	mask[1] = mask_y[mask_ix];
+	dc_mask[1] = mask_dc_y[mask_ix];
+	mask[2] = mask_b[mask_ix];
+	dc_mask[2] = mask_dc_b[mask_ix];
+
+	size_t res_ix = (res_y * res_xsize + res_x) / step;
+	result[res_ix] = (float)(
+		DotProduct(&block_diff_dc[3 * res_ix], dc_mask) +
+		DotProduct(&block_diff_ac[3 * res_ix], mask) +
+		DotProduct(&edge_detector_map[3 * res_ix], mask));
+}
+
+// Odd-symmetric piecewise-linear lookup: interpolates array at |sx|, clamps past the last entry, and negates the result for sx < 0.
+inline double Interpolate(const double *array, int size, double sx) {
+	double ix = fabs(sx);
+	int baseix = (int)(ix);  // OpenCL C has no static_cast; truncate with a C cast
+	double res;
+	if (baseix >= size - 1) {
+		res = array[size - 1];
+	}
+	else {
+		double mix = ix - baseix;
+		int nextix = baseix + 1;
+		res = array[baseix] + mix * (array[nextix] - array[baseix]);
+	}
+	if (sx < 0) res = -res;
+	return res;
+}
+
+std::array<double, 21> MakeLowFreqColorDiffDy() {
+	std::array<double, 21> lut;
+	static const double inc = 5.2511644570349185;
+	lut[0] = 0.0;
+	for (int i = 1; i < 21; ++i) {
+		lut[i] = lut[i - 1] + inc;
+	}
+	return lut;
+}
+
+const double *GetLowFreqColorDiffDy() {
+	static const std::array<double, 21> kLut = MakeLowFreqColorDiffDy();
+	return kLut.data();
+}
+
+void XybLowFreqToVals(double x, double y, double z,
+	double *valx, double *valy, double *valz) {
+	static const double xmul = 6.64482198135;
+	static const double ymul = 0.837846224276;
+	static const double zmul = 7.34905756986;
+	static const double y_to_z_mul = 0.0812519812628;
+	z += y_to_z_mul * y;
+	*valz = z * zmul;
+	*valx = x * xmul;
+	*valy = Interpolate(GetLowFreqColorDiffDy(), 21, y * ymul);
+}
+
+void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0,
+	double r1, double g1, double b1,
+	double factor, double res[3]) {
+	double valx0, valy0, valz0;
+	double valx1, valy1, valz1;
+	XybLowFreqToVals(r0, g0, b0, &valx0, &valy0, &valz0);
+	if (r1 == 0.0 && g1 == 0.0 && b1 == 0.0) {
+		PROFILER_ZONE("XybDiff r1=g1=b1=0");
+		res[0] += factor * valx0 * valx0;
+		res[1] += factor * valy0 * valy0;
+		res[2] += factor * valz0 * valz0;
+		return;
+	}
+	XybLowFreqToVals(r1, g1, b1, &valx1, &valy1, &valz1);
+	// Approximate the distance of the colors by their respective distances
+	// to gray.
+	double valx = valx0 - valx1;
+	double valy = valy0 - valy1;
+	double valz = valz0 - valz1;
+	res[0] += factor * valx * valx;
+	res[1] += factor * valy * valy;
+	res[2] += factor * valz * valz;
+}
+
+// For each result cell, accumulates squared low-frequency XYB differences at four probe points and stores three floats per cell.
+__kernel void edgeDetectorMap(__global float *result, __global float *r, __global float *g, __global float* b, __global float *r2, __global float* g2, __global float *b2, int xsize, int ysize, int step)
+{
+	const int result_x = get_global_id(0);
+	const int result_y = get_global_id(1);
+
+	const int result_xsize = get_global_size(0);
+	const int result_ysize = get_global_size(1);
+	int pos_x = result_x * step;
+	int pos_y = result_y * step;
+
+	int local_count = 0;
+	double local_xyb[3] = { 0 };
+	const double w = 0.711100840192;
+	// (dx, dy) probe offsets relative to (pos_x, pos_y).
+	int offset[4][2] = { { 0, 0 }, { 0, 7 }, { 7, 0 }, { 7, 7 } };
+	int edgeSize = 3;
+
+	for (int k = 0; k < 4; k++)
+	{
+		int x = pos_x + offset[k][0];
+		int y = pos_y + offset[k][1];
+
+		if (x >= edgeSize && x + edgeSize < xsize) {
+			size_t ix = y * xsize + (x - edgeSize);
+			size_t ix2 = ix + 2 * edgeSize;
+			XybDiffLowFreqSquaredAccumulate(
+				w * (r[ix] - r[ix2]),
+				w * (g[ix] - g[ix2]),
+				w * (b[ix] - b[ix2]),
+				w * (r2[ix] - r2[ix2]),
+				w * (g2[ix] - g2[ix2]),
+				w * (b2[ix] - b2[ix2]),
+				1.0, local_xyb);
+			++local_count;
+		}
+		if (y >= edgeSize && y + edgeSize < ysize) {
+			size_t ix = (y - edgeSize) * xsize + x;
+			size_t ix2 = ix + 2 * edgeSize * xsize;
+			XybDiffLowFreqSquaredAccumulate(
+				w * (r[ix] - r[ix2]),
+				w * (g[ix] - g[ix2]),
+				w * (b[ix] - b[ix2]),
+				w * (r2[ix] - r2[ix2]),
+				w * (g2[ix] - g2[ix2]),
+				w * (b2[ix] - b2[ix2]),
+				1.0, local_xyb);
+			++local_count;
+		}
+	}
+	// NOTE(review): mul is computed but never applied to the values stored below - confirm intent.
+	static const double weight = 0.01617112696;
+	const double mul = weight * 8.0 / local_count;
+
+	int idx = (result_y * result_xsize + result_x) * 3;
+	result[idx]     = local_xyb[0];
+	result[idx + 1] = local_xyb[1];
+	result[idx + 2] = local_xyb[2];
+}
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 2ca1ceb5..c96fb8f1 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -60,6 +60,7 @@ ocl_args_d_t& getOcl(void)
 	ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "OpsinDynamicsImage", &err);
 	ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "DoMask", &err);
 	ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "ScaleImage", &err);
+	ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "CombineChannels", &err);
 
 	return ocl;
 }
@@ -396,7 +397,6 @@ void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred
 	}
 }
 
-// strong todo
 void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b)
 {
 	static const double kSigma = 1.1;
@@ -442,17 +442,25 @@ void clMaskHighIntensityChangeEx(ocl_channels rgb/*in,out*/,
 }
 
 // strong todo
-void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, cl_mem result/*out*/)
+void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, size_t step, cl_mem result/*out*/)
 {
+	cl_int channel_size = xsize * ysize * sizeof(float);
+
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+
+	ocl_channels rgb_blured = ocl.allocMemChannels(channel_size);
+	ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size);
+
 	static const double kSigma[3] = { 1.5, 0.586, 0.4 };
-	clBlurEx(rgb.r,  xsize, ysize, kSigma[0], 0.0);
-	clBlurEx(rgb2.r, xsize, ysize, kSigma[0], 0.0);
-	clBlurEx(rgb.g,  xsize, ysize, kSigma[1], 0.0);
-	clBlurEx(rgb2.g, xsize, ysize, kSigma[1], 0.0);
-	clBlurEx(rgb.b,  xsize, ysize, kSigma[2], 0.0);
-	clBlurEx(rgb2.b, xsize, ysize, kSigma[2], 0.0);
 
+	for (int i = 0; i < 3; i++)
+	{
+		clBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]);
+		clBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]);
+	}
 	// EdgeDetectorLowFreq
+
 }
 
 // strong todo
@@ -468,15 +476,23 @@ void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2,
 	size_t xsize, size_t ysize,
 	cl_mem block_diff_ac/*out*/)
 {
+	cl_int channel_size = xsize * ysize * sizeof(float);
+
 	static const double kSigma = 14;
 	static const double kMul = 10;
 
-	clBlurEx(rgb.r, xsize, ysize,  kSigma, 0.0);
-	clBlurEx(rgb2.r, xsize, ysize, kSigma, 0.0);
-	clBlurEx(rgb.g, xsize, ysize,  kSigma, 0.0);
-	clBlurEx(rgb2.g, xsize, ysize, kSigma, 0.0);
-	clBlurEx(rgb.b, xsize, ysize,  kSigma, 0.0);
-	clBlurEx(rgb2.b, xsize, ysize, kSigma, 0.0);
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+	ocl_channels rgb_blured = ocl.allocMemChannels(channel_size);
+	ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size);
+
+	static const double kSigma[3] = { 1.5, 0.586, 0.4 };
+
+	for (int i = 0; i < 3; i++)
+	{
+		clBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]);
+		clBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]);
+	}
 }
 
 // ian todo
@@ -628,10 +644,52 @@ void clMaskEx(ocl_channels rgb, ocl_channels rgb2,
     }
 }
 
-// ian todo
-void clCombineChannelsEx(ocl_channels mask, ocl_channels mask_dc, cl_mem block_diff_dc, cl_mem block_diff_ac, cl_mem edge_detector_map, size_t step, cl_mem result/*out*/)
+void clCombineChannelsEx(
+	ocl_channels mask,
+	ocl_channels mask_dc,
+	cl_mem block_diff_dc,
+	cl_mem block_diff_ac,
+	cl_mem edge_detector_map,
+	size_t xsize, size_t ysize,
+	size_t step,
+	size_t res_xsize,
+	cl_mem result/*out*/)
 {
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+
+	cl_int clxsize = xsize;
+	cl_int clysize = ysize;
+	cl_int clstep = step;
+	cl_int clres_xsize = res_xsize;
 
+	cl_kernel kernel = ocl.kernel[KERNEL_COMBINECHANNELS];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.r);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.g);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.b);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask_dc.r);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mask_dc.g);
+	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask_dc.b);
+	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&block_diff_dc);
+	clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&block_diff_ac);
+	clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&edge_detector_map);
+	clSetKernelArg(kernel, 9, sizeof(cl_int), (void*)&clxsize);
+	clSetKernelArg(kernel, 10, sizeof(cl_int), (void*)&clysize);
+	clSetKernelArg(kernel, 11, sizeof(cl_int), (void*)&clstep);
+	clSetKernelArg(kernel, 12, sizeof(cl_int), (void*)&clres_xsize);
+	clSetKernelArg(kernel, 13, sizeof(cl_mem), (void*)&result);
+
+	size_t globalWorkSize[2] = { xsize / step, ysize /step };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clCombineChannelsEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clCombineChannelsEx() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
 }
 
 // strong todo
@@ -691,10 +749,10 @@ void clCalculateDiffmapEx(cl_mem result/*in,out*/, size_t xsize, size_t ysize, i
     clScaleImageEx(result, xsize * ysize, scale, result);
 }
 
-// strong todo
-void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b,
+void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 								 float* r2, float* g2, float* b2,
 								 size_t xsize, size_t ysize,
+								 size_t step,
 								 float* result)
 {
 
@@ -713,24 +771,43 @@ void clDiffmapOpsinDynamicsImage(float* r, float* g, float* b,
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
-	cl_mem edge_detector_map = ocl.allocMem(3 * xsize * ysize);
-	cl_mem block_diff_dc = ocl.allocMem(3 * xsize * ysize);
-	cl_mem block_diff_ac = ocl.allocMem(3 * xsize * ysize);
+	cl_mem edge_detector_map = ocl.allocMem(3 * xsize * ysize * sizeof(float));
+	cl_mem block_diff_dc = ocl.allocMem(3 * xsize * ysize * sizeof(float));
+	cl_mem block_diff_ac = ocl.allocMem(3 * xsize * ysize * sizeof(float));
 
 	ocl_channels mask = ocl.allocMemChannels(channel_size);
 	ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
 
-	cl_mem mem_result;
+	size_t res_xsize_; // ��Ա��������Ҫ����
+	size_t res_ysize_; // ��Ա��������Ҫ����
+	cl_mem mem_result = ocl.allocMem(channel_size);
 
 	clMaskHighIntensityChangeEx(xyb, xyb2, xsize, ysize);
 
-	clEdgeDetectorMapEx(xyb, xyb2, xsize, ysize, edge_detector_map);
+	clEdgeDetectorMapEx(xyb, xyb2, xsize, ysize, step, edge_detector_map);
 	clBlockDiffMapEx(xyb, xyb2, xsize, ysize, block_diff_dc, block_diff_ac);
 	clEdgeDetectorLowFreqEx(xyb, xyb2, xsize, ysize, block_diff_ac);
 
-    int step = 4;
 	clMaskEx(xyb, xyb2, xsize, ysize, mask, mask_dc);
-	clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, step, mem_result);
+
+	size_t xsize_ = 0, ysize_ = 0; // ��Ա��������Ҫ����
+	clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize_, ysize_, step, res_xsize_, mem_result);
 
     clCalculateDiffmapEx(mem_result, xsize, ysize, step);
+
+
+	cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	memcpy(result, result_r, channel_size);
+
+	ocl.releaseMemChannels(xyb);
+	ocl.releaseMemChannels(xyb2);
+
+	clReleaseMemObject(edge_detector_map);
+	clReleaseMemObject(block_diff_dc);
+	clReleaseMemObject(block_diff_ac);
+
+	ocl.releaseMemChannels(mask);
+	ocl.releaseMemChannels(mask_dc);
+
+	clReleaseMemObject(mem_result);
 }
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index ca8b0b32..6f29dd35 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -18,4 +18,10 @@ void clConvolution(size_t xsize, size_t ysize,
 
 void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio);
 
-void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b);
\ No newline at end of file
+void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b);
+
+void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
+	float* r2, float* g2, float* b2,
+	size_t xsize, size_t ysize,
+	size_t step,
+	float* result);
\ No newline at end of file
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 53fac1c8..d0370bb3 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -44,7 +44,7 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType);
 */
 
 enum KernelName {
-	KERNEL_MINSQUAREVAL,
+	KERNEL_MINSQUAREVAL = 0,
 	KERNEL_CONVOLUTION,
 	KERNEL_CONVOLUTIONX,
 	KERNEL_CONVOLUTIONY,
@@ -52,6 +52,7 @@ enum KernelName {
 	KERNEL_OPSINDYNAMICSIMAGE,
 	KERNEL_DOMASK,
 	KERNEL_SCALEIMAGE,
+	KERNEL_COMBINECHANNELS,
 	KERNEL_COUNT,
 };
 
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 0753d713..2fd045d8 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -1120,6 +1120,15 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage(
     const std::vector<std::vector<float>> &xyb0_arg,
     std::vector<std::vector<float>> &xyb1,
     std::vector<float> &result) {
+/*
+	if (g_useOpenCL && xsize_ > 100 && ysize_ > 100)
+	{
+		result.resize(xsize_ * ysize_);
+		clDiffmapOpsinDynamicsImage(xyb0_arg[0].data(), xyb0_arg[1].data(), xyb0_arg[2].data(),
+			xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, result.data());
+	}
+*/
+
   if (xsize_ < 8 || ysize_ < 8) return;
   auto xyb0 = xyb0_arg;
   {

From 56ac179849f055000c43b78dc7d98283f948d0dd Mon Sep 17 00:00:00 2001
From: ianuming <uming.zelda@gmail.com>
Date: Fri, 5 May 2017 20:01:05 +0800
Subject: [PATCH 029/189] Implement clMaskHighIntensityChangeEx

---
 clguetzli/clguetzli.cl  |  71 ++++++++++++++++-
 clguetzli/clguetzli.cpp | 173 ++++++++++++++++++++++++++++++----------
 clguetzli/ocl.h         |   1 +
 3 files changed, 200 insertions(+), 45 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 3f9820dc..91f05490 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -503,7 +503,7 @@ __kernel void CombineChannels(
 inline double Interpolate(const double *array, int size, double sx) {
 	double ix = fabs(sx);
 
-	int baseix = static_cast<int>(ix);
+	int baseix = (int)(ix);
 	double res;
 	if (baseix >= size - 1) {
 		res = array[size - 1];
@@ -517,6 +517,7 @@ inline double Interpolate(const double *array, int size, double sx) {
 	return res;
 }
 
+/*
 std::array<double, 21> MakeLowFreqColorDiffDy() {
 	std::array<double, 21> lut;
 	static const double inc = 5.2511644570349185;
@@ -531,6 +532,7 @@ const double *GetLowFreqColorDiffDy() {
 	static const std::array<double, 21> kLut = MakeLowFreqColorDiffDy();
 	return kLut.data();
 }
+*/
 
 void XybLowFreqToVals(double x, double y, double z,
 	double *valx, double *valy, double *valz) {
@@ -541,7 +543,7 @@ void XybLowFreqToVals(double x, double y, double z,
 	z += y_to_z_mul * y;
 	*valz = z * zmul;
 	*valx = x * xmul;
-	*valy = Interpolate(GetLowFreqColorDiffDy(), 21, y * ymul);
+	//*valy = Interpolate(GetLowFreqColorDiffDy(), 21, y * ymul);
 }
 
 void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0,
@@ -551,7 +553,7 @@ void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0,
 	double valx1, valy1, valz1;
 	XybLowFreqToVals(r0, g0, b0, &valx0, &valy0, &valz0);
 	if (r1 == 0.0 && g1 == 0.0 && b1 == 0.0) {
-		PROFILER_ZONE("XybDiff r1=g1=b1=0");
+		//PROFILER_ZONE("XybDiff r1=g1=b1=0");
 		res[0] += factor * valx0 * valx0;
 		res[1] += factor * valy0 * valy0;
 		res[2] += factor * valz0 * valz0;
@@ -583,9 +585,10 @@ __kernel void edgeDetectorMap(__global float *result, __global float *r, __globa
 	double local_xyb[3] = { 0 };
 	const double w = 0.711100840192;
 
-	int offset[4][2] = { { 0��0}�� { 0��7}��{ 7��0}��{ 7��7} };
+	//int offset[4][2] = { { 0��0}�� { 0��7}��{ 7��0}��{ 7��7} };
 	int edgeSize = 3;
 
+	/*
 	for (int k = 0; i < 4; k++)
 	{
 		int x = pos_x + offset[k][0];
@@ -618,6 +621,7 @@ __kernel void edgeDetectorMap(__global float *result, __global float *r, __globa
 			++local_count;
 		}
 	}
+	*/
 
 	static const double weight = 0.01617112696;
 	const double mul = weight * 8.0 / local_count;
@@ -627,3 +631,62 @@ __kernel void edgeDetectorMap(__global float *result, __global float *r, __globa
 	result[idx + 1] = local_xyb[1];
 	result[idx + 2] = local_xyb[2];
 }
+
+__kernel void MaskHighIntensityChange(
+	__global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b,
+	__global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b,
+	__global float *c0_x, __global float *c0_y, __global float *c0_b,
+	__global float *c1_x, __global float *c1_y, __global float *c1_b
+	)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	const int xsize = get_global_size(0);
+	const int ysize = get_global_size(1);
+
+	size_t ix = y * xsize + x;
+	const double ave[3] = {
+	(c0_x[ix] + c1_x[ix]) * 0.5,
+	(c0_y[ix] + c1_y[ix]) * 0.5,
+	(c0_b[ix] + c1_b[ix]) * 0.5,
+	};
+	double sqr_max_diff = -1;
+	{
+		int offset[4] =
+			{ -1, 1, -(int)(xsize), (int)(xsize) };
+		int border[4] =
+			{ x == 0, x + 1 == xsize, y == 0, y + 1 == ysize };
+		for (int dir = 0; dir < 4; ++dir) {
+			if (border[dir]) {
+			continue;
+			}
+			const int ix2 = ix + offset[dir];
+			double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1];
+			diff *= diff;
+			if (sqr_max_diff < diff) {
+			sqr_max_diff = diff;
+			}
+		}
+	}
+	const double kReductionX = 275.19165240059317;
+	const double kReductionY = 18599.41286306991;
+	const double kReductionZ = 410.8995306951065;
+	const double kChromaBalance = 106.95800948271017;
+	double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance);
+
+	const double mix[3] = {
+		chroma_scale * kReductionX / (sqr_max_diff + kReductionX),
+		kReductionY / (sqr_max_diff + kReductionY),
+		chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ),
+	};
+	// Interpolate lineraly between the average color and the actual
+	// color -- to reduce the importance of this pixel.
+	xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]);
+	xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]);
+
+	xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]);
+	xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]);
+
+	xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]);
+	xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]);
+}
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index c96fb8f1..55d7953a 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -61,6 +61,7 @@ ocl_args_d_t& getOcl(void)
 	ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "DoMask", &err);
 	ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "ScaleImage", &err);
 	ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "CombineChannels", &err);
+	ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "MaskHighIntensityChange", &err);
 
 	return ocl;
 }
@@ -433,12 +434,37 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float*
 	ocl.releaseMemChannels(rgb_blurred);
 }
 
-// ian todo
-void clMaskHighIntensityChangeEx(ocl_channels rgb/*in,out*/,
-                                 ocl_channels rgb2/*in,out*/,
+void clMaskHighIntensityChangeEx(ocl_channels xyb0_arg/*in,out*/,
+                                 ocl_channels xyb1/*in,out*/,
+								 ocl_channels c0,
+								 ocl_channels c1,
                                  size_t xsize, size_t ysize)
 {
-	// MaskHighIntensityChange
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+
+	cl_kernel kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0_arg.r);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0_arg.g);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0_arg.b);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&c0.r);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&c0.g);
+	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&c0.b);
+	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&c1.r);
+	clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&c1.g);
+	clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&c1.b);
+
+	size_t globalWorkSize[2] = { xsize, ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clScaleImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clScaleImageEx() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
 }
 
 // strong todo
@@ -486,7 +512,7 @@ void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2,
 	ocl_channels rgb_blured = ocl.allocMemChannels(channel_size);
 	ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size);
 
-	static const double kSigma[3] = { 1.5, 0.586, 0.4 };
+	//static const double kSigma[3] = { 1.5, 0.586, 0.4 };
 
 	for (int i = 0; i < 3; i++)
 	{
@@ -501,7 +527,7 @@ void clDiffPrecomputeEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_
 
 }
 
-void clScaleImageEx(cl_mem img, size_t size, float w, cl_mem result/*out*/)
+void clScaleImageEx(cl_mem img/*in, out*/, size_t size, float w)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -509,16 +535,9 @@ void clScaleImageEx(cl_mem img, size_t size, float w, cl_mem result/*out*/)
 	cl_int clsize = size;
 	cl_float clscale = w;
 
-
-	err = clEnqueueCopyBuffer(ocl.commandQueue, img, result, 0, 0, clsize, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clScaleImageEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err));
-	}
-
 	cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE];
 	clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&clscale);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&result);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img);
 
 	size_t globalWorkSize[1] = { clsize };
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -536,16 +555,75 @@ void clScaleImageEx(cl_mem img, size_t size, float w, cl_mem result/*out*/)
 // ian todo
 void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize)
 {
-    static const float w = 0.679144890667f;
-    static const float scale = 1.0f / (5.0f + 4 * w);
+	if (xsize < 4 || ysize < 4) {
+		// TODO: Make this work for small dimensions as well.
+		return;
+	}
 
-    cl_mem tmp0;
-    cl_mem tmp1;
-    clScaleImageEx(img, xsize * ysize, w, tmp0);
-    clScaleImageEx(img, xsize * ysize, 1, tmp1);
-    // average5x5 calc
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
 
-    clScaleImageEx(img, xsize * ysize, scale, img);
+	size_t len = xsize * ysize * sizeof(float);
+	ocl.allocA(len);
+	ocl.allocB(len);
+	ocl.allocC(len);
+	cl_mem result = ocl.srcA;
+	cl_mem tmp0 = ocl.srcB;
+	cl_mem tmp1 = ocl.dstMem;
+
+	err = clEnqueueCopyBuffer(ocl.commandQueue, img, result, 0, 0, len, 0, NULL, NULL);
+	err = clEnqueueCopyBuffer(ocl.commandQueue, img, tmp0, 0, 0, len, 0, NULL, NULL);
+	err = clEnqueueCopyBuffer(ocl.commandQueue, img, tmp1, 0, 0, len, 0, NULL, NULL);
+
+	static const float w = 0.679144890667f;
+	static const float scale = 1.0f / (5.0f + 4 * w);
+
+	clScaleImageEx(tmp1, xsize * ysize, w);
+	/* TODO
+	for (int y = 0; y < ysize; y++) {
+		const int row0 = y * xsize;
+		result[row0 + 1] += tmp0[row0];
+		result[row0 + 0] += tmp0[row0 + 1];
+		result[row0 + 2] += tmp0[row0 + 1];
+		for (int x = 2; x < xsize - 2; ++x) {
+			result[row0 + x - 1] += tmp0[row0 + x];
+			result[row0 + x + 1] += tmp0[row0 + x];
+		}
+		result[row0 + xsize - 3] += tmp0[row0 + xsize - 2];
+		result[row0 + xsize - 1] += tmp0[row0 + xsize - 2];
+		result[row0 + xsize - 2] += tmp0[row0 + xsize - 1];
+		if (y > 0) {
+			const int rowd1 = row0 - xsize;
+			result[rowd1 + 1] += tmp1[row0];
+			result[rowd1 + 0] += tmp0[row0];
+			for (int x = 1; x < xsize - 1; ++x) {
+				result[rowd1 + x + 1] += tmp1[row0 + x];
+				result[rowd1 + x + 0] += tmp0[row0 + x];
+				result[rowd1 + x - 1] += tmp1[row0 + x];
+			}
+			result[rowd1 + xsize - 1] += tmp0[row0 + xsize - 1];
+			result[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1];
+		}
+		if (y + 1 < ysize) {
+			const int rowu1 = row0 + xsize;
+			result[rowu1 + 1] += tmp1[row0];
+			result[rowu1 + 0] += tmp0[row0];
+			for (int x = 1; x < xsize - 1; ++x) {
+				result[rowu1 + x + 1] += tmp1[row0 + x];
+				result[rowu1 + x + 0] += tmp0[row0 + x];
+				result[rowu1 + x - 1] += tmp1[row0 + x];
+			}
+			result[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1];
+			result[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1];
+		}
+	}
+	*/
+	err = clEnqueueCopyBuffer(ocl.commandQueue, result, img, 0, 0, len, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clAverage5x5Ex() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err));
+	}
+	clScaleImageEx(img, xsize * ysize, scale);
 }
 
 void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset)
@@ -639,8 +717,8 @@ void clMaskEx(ocl_channels rgb, ocl_channels rgb2,
 
     for (int i = 0; i < 3; i++)
     {
-        clScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale, mask.ch[i]);
-        clScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale, mask_dc.ch[i]);
+        clScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale);
+        clScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale);
     }
 }
 
@@ -746,7 +824,7 @@ void clCalculateDiffmapEx(cl_mem result/*in,out*/, size_t xsize, size_t ysize, i
     }
     }
 */
-    clScaleImageEx(result, xsize * ysize, scale, result);
+    clScaleImageEx(result, xsize * ysize, scale);
 }
 
 void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
@@ -760,15 +838,26 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	ocl_channels xyb = ocl.allocMemChannels(channel_size);
-	ocl_channels xyb2 = ocl.allocMemChannels(channel_size);
-
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb2.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb2.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
+	ocl_channels xyb0_arg = ocl.allocMemChannels(channel_size);
+	ocl_channels xyb1 = ocl.allocMemChannels(channel_size);
+
+	ocl_channels xyb0 = ocl.allocMemChannels(channel_size);
+	ocl_channels xyb1_c = ocl.allocMemChannels(channel_size);
+
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0_arg.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0_arg.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0_arg.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
+
+
+	err = clEnqueueCopyBuffer(ocl.commandQueue, xyb0_arg.r, xyb0.r, 0, 0, channel_size, 0, NULL, NULL);
+	err = clEnqueueCopyBuffer(ocl.commandQueue, xyb0_arg.g, xyb0.g, 0, 0, channel_size, 0, NULL, NULL);
+	err = clEnqueueCopyBuffer(ocl.commandQueue, xyb0_arg.b, xyb0.b, 0, 0, channel_size, 0, NULL, NULL);
+	err = clEnqueueCopyBuffer(ocl.commandQueue, xyb1.r, xyb1_c.r, 0, 0, channel_size, 0, NULL, NULL);
+	err = clEnqueueCopyBuffer(ocl.commandQueue, xyb1.g, xyb1_c.g, 0, 0, channel_size, 0, NULL, NULL);
+	err = clEnqueueCopyBuffer(ocl.commandQueue, xyb1.b, xyb1_c.b, 0, 0, channel_size, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
 	cl_mem edge_detector_map = ocl.allocMem(3 * xsize * ysize * sizeof(float));
@@ -782,13 +871,13 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	size_t res_ysize_; // ��Ա��������Ҫ����
 	cl_mem mem_result = ocl.allocMem(channel_size);
 
-	clMaskHighIntensityChangeEx(xyb, xyb2, xsize, ysize);
+	clMaskHighIntensityChangeEx(xyb0_arg, xyb1_c, xyb0, xyb1, xsize, ysize);
 
-	clEdgeDetectorMapEx(xyb, xyb2, xsize, ysize, step, edge_detector_map);
-	clBlockDiffMapEx(xyb, xyb2, xsize, ysize, block_diff_dc, block_diff_ac);
-	clEdgeDetectorLowFreqEx(xyb, xyb2, xsize, ysize, block_diff_ac);
+	//clEdgeDetectorMapEx(xyb0_arg, xyb1, xsize, ysize, edge_detector_map);
+	clBlockDiffMapEx(xyb0_arg, xyb1, xsize, ysize, block_diff_dc, block_diff_ac);
+	clEdgeDetectorLowFreqEx(xyb0_arg, xyb1, xsize, ysize, block_diff_ac);
 
-	clMaskEx(xyb, xyb2, xsize, ysize, mask, mask_dc);
+	clMaskEx(xyb0_arg, xyb1, xsize, ysize, mask, mask_dc);
 
 	size_t xsize_ = 0, ysize_ = 0; // ��Ա��������Ҫ����
 	clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize_, ysize_, step, res_xsize_, mem_result);
@@ -799,8 +888,10 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
 	memcpy(result, result_r, channel_size);
 
-	ocl.releaseMemChannels(xyb);
-	ocl.releaseMemChannels(xyb2);
+	ocl.releaseMemChannels(xyb0_arg);
+	ocl.releaseMemChannels(xyb1);
+	ocl.releaseMemChannels(xyb0);
+	ocl.releaseMemChannels(xyb1_c);
 
 	clReleaseMemObject(edge_detector_map);
 	clReleaseMemObject(block_diff_dc);
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index d0370bb3..64f94de2 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -53,6 +53,7 @@ enum KernelName {
 	KERNEL_DOMASK,
 	KERNEL_SCALEIMAGE,
 	KERNEL_COMBINECHANNELS,
+	KERNEL_MASKHIGHINTENSITYCHANGE,
 	KERNEL_COUNT,
 };
 

From a024ec13719b0f6d52738e167115e15d4731b380 Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Fri, 5 May 2017 22:46:54 +0800
Subject: [PATCH 030/189] Implement clDiffPrecomputeEx

---
 clguetzli/clguetzli.cl  | 89 +++++++++++++++++++++++++++++++++++++++++
 clguetzli/clguetzli.cpp | 45 ++++++++++++++++-----
 clguetzli/ocl.h         |  1 +
 3 files changed, 126 insertions(+), 9 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 91f05490..846ef013 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -690,3 +690,92 @@ __kernel void MaskHighIntensityChange(
 	xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]);
 	xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]);
 }
+
+void XybToVals(
+	double x, double y, double z,
+	double *valx, double *valy, double *valz)
+{
+	static const double xmul = 0.758304045695;
+	static const double ymul = 2.28148649801;
+	static const double zmul = 1.87816926918;
+
+	double lut[21] = { 0.0 };
+	const double off = 11.38708334481672;
+	const double inc = 14.550189611520716;
+	lut[0] = 0.0;
+	lut[1] = off;
+	for (int i = 2; i < 21; ++i) {
+		lut[i] = lut[i - 1] + inc;
+	}
+
+	*valx = Interpolate(lut, 21, x * xmul);
+	*valy = Interpolate(lut, 21, y * ymul);
+	*valz = zmul * z;
+}
+
+
+__kernel void DiffPrecompute(
+	__global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b,
+	__global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b,
+	__global float *mask_x, __global float *mask_y, __global float *mask_b )
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	const int xsize = get_global_size(0);
+	const int ysize = get_global_size(1);
+
+	double valsh0[3] = { 0.0 };
+	double valsv0[3] = { 0.0 };
+	double valsh1[3] = { 0.0 };
+	double valsv1[3] = { 0.0 };
+	int ix2;
+
+	size_t ix = x + xsize * y;
+	if (x + 1 < xsize) {
+		ix2 = ix + 1;
+	}
+	else {
+		ix2 = ix - 1;
+	}
+	{
+		double x0 = (xyb0_x[ix] - xyb0_x[ix2]);
+		double y0 = (xyb0_y[ix] - xyb0_y[ix2]);
+		double z0 = (xyb0_b[ix] - xyb0_b[ix2]);
+		XybToVals(x0, y0, z0, &valsh0[0], &valsh0[1], &valsh0[2]);
+		double x1 = (xyb1_x[ix] - xyb1_x[ix2]);
+		double y1 = (xyb1_y[ix] - xyb1_y[ix2]);
+		double z1 = (xyb1_b[ix] - xyb1_b[ix2]);
+		XybToVals(x1, y1, z1, &valsh1[0], &valsh1[1], &valsh1[2]);
+	}
+	if (y + 1 < ysize) {
+		ix2 = ix + xsize;
+	}
+	else {
+		ix2 = ix - xsize;
+	}
+	{
+		double x0 = (xyb0_x[ix] - xyb0_x[ix2]);
+		double y0 = (xyb0_y[ix] - xyb0_y[ix2]);
+		double z0 = (xyb0_b[ix] - xyb0_b[ix2]);
+		XybToVals(x0, y0, z0, &valsv0[0], &valsv0[1], &valsv0[2]);
+		double x1 = (xyb1_x[ix] - xyb1_x[ix2]);
+		double y1 = (xyb1_y[ix] - xyb1_y[ix2]);
+		double z1 = (xyb1_b[ix] - xyb1_b[ix2]);
+		XybToVals(x1, y1, z1, &valsv1[0], &valsv1[1], &valsv1[2]);
+	}
+
+	double sup0 = fabs(valsh0[0]) + fabs(valsv0[0]);
+	double sup1 = fabs(valsh1[0]) + fabs(valsv1[0]);
+	double m = min(sup0, sup1);
+	mask_x[ix] = (float)(m);
+
+	sup0 = fabs(valsh0[1]) + fabs(valsv0[1]);
+	sup1 = fabs(valsh1[1]) + fabs(valsv1[1]);
+	m = min(sup0, sup1);
+	mask_y[ix] = (float)(m);
+
+	sup0 = fabs(valsh0[2]) + fabs(valsv0[2]);
+	sup1 = fabs(valsh1[2]) + fabs(valsv1[2]);
+	m = min(sup0, sup1);
+	mask_b[ix] = (float)(m);
+}
\ No newline at end of file
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 55d7953a..e5d90d21 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -22,7 +22,7 @@ ocl_args_d_t& getOcl(void)
 
 	char* source = nullptr;
 	size_t src_size = 0;
-	ReadSourceFromFile("clguetzli.cl", &source, &src_size);
+	ReadSourceFromFile("clguetzli\\clguetzli.cl", &source, &src_size);
 
 	ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err);
 
@@ -62,6 +62,7 @@ ocl_args_d_t& getOcl(void)
 	ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "ScaleImage", &err);
 	ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "CombineChannels", &err);
 	ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "MaskHighIntensityChange", &err);
+	ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "DiffPrecompute", &err);
 
 	return ocl;
 }
@@ -447,12 +448,15 @@ void clMaskHighIntensityChangeEx(ocl_channels xyb0_arg/*in,out*/,
 	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0_arg.r);
 	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0_arg.g);
 	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0_arg.b);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&c0.r);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&c0.g);
-	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&c0.b);
-	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&c1.r);
-	clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&c1.g);
-	clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&c1.b);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb1.r);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb1.g);
+	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb1.b);
+	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&c0.r);
+	clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&c0.g);
+	clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&c0.b);
+	clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&c1.r);
+	clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&c1.g);
+	clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&c1.b);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -521,10 +525,33 @@ void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2,
 	}
 }
 
-// ian todo
-void clDiffPrecomputeEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, ocl_channels mask/*out*/)
+void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size_t ysize, ocl_channels mask/*out*/)
 {
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+
+	cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.r);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0.g);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0.b);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb1.r);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb1.g);
+	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb1.b);
+	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&mask.r);
+	clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&mask.g);
+	clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&mask.b);
 
+	size_t globalWorkSize[2] = { xsize, ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clScaleImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clScaleImageEx() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
 }
 
 void clScaleImageEx(cl_mem img/*in, out*/, size_t size, float w)
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 64f94de2..3e20a2f3 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -54,6 +54,7 @@ enum KernelName {
 	KERNEL_SCALEIMAGE,
 	KERNEL_COMBINECHANNELS,
 	KERNEL_MASKHIGHINTENSITYCHANGE,
+	KERNEL_DIFFPRECOMPUTE,
 	KERNEL_COUNT,
 };
 

From 13637b276899efddd257659495efb74ea389d8e6 Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Fri, 5 May 2017 22:47:21 +0800
Subject: [PATCH 031/189] Implement clDiffPrecomputeEx

---
 clguetzli/clguetzli.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index e5d90d21..4ec86456 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -545,12 +545,12 @@ void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
-		LogError("Error: clScaleImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+		LogError("Error: clDiffPrecomputeEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
 	}
 	err = clFinish(ocl.commandQueue);
 	if (CL_SUCCESS != err)
 	{
-		LogError("Error: clScaleImageEx() clFinish returned %s.\n", TranslateOpenCLError(err));
+		LogError("Error: clDiffPrecomputeEx() clFinish returned %s.\n", TranslateOpenCLError(err));
 	}
 }
 

From b7b19edc7cbbc7d072bb1eda642269d113be43a9 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 6 May 2017 15:54:12 +0800
Subject: [PATCH 032/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

# Conflicts:
#	clguetzli/clguetzli.cl
#	clguetzli/clguetzli.cpp
#	clguetzli/ocl.h
---
 clguetzli/clguetzli.cl  | 787 ++++++++++++++++++++++++++++++++++++----
 clguetzli/clguetzli.cpp | 378 +++++++++++++------
 clguetzli/ocl.h         |   7 +
 3 files changed, 986 insertions(+), 186 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 846ef013..86ffb63a 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -33,16 +33,6 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_si
 	pC[y * width + x] = minValue;
 }
 
-float calcWeight(__global float* multipliers, int len)
-{
-	float weight_no_border = 0;
-	for (int j = 0; j < len; j++)
-	{
-		weight_no_border += multipliers[j];
-	}
-	return weight_no_border;
-}
-
 __kernel void Convolution(__global float* multipliers, __global float* inp, __global float* result,
 							int xsize, int xstep, int len, int offset, float border_ratio)
 {
@@ -53,14 +43,12 @@ __kernel void Convolution(__global float* multipliers, __global float* inp, __gl
 	const int ysize = get_global_size(1);
 
 	const int x = ox * xstep;
-/*
+
 	float weight_no_border = 0;
 	for (int j = 0; j <= 2 * offset; j++)
 	{
 		weight_no_border += multipliers[j];
 	}
-*/
-	float weight_no_border = calcWeight(multipliers, len);
 
 	int minx = x < offset ? 0 : x - offset;
 	int maxx = min(xsize, x + len - offset);
@@ -473,12 +461,14 @@ __kernel void CombineChannels(
 	__global float *edge_detector_map,
 	int xsize, int ysize,
 	int step,
-	int res_xsize,
 	__global float *result)
 {
 	const int res_x = get_global_id(0);
 	const int res_y = get_global_id(1);
 
+	const int res_xsize = get_global_size(0);
+	const int res_ysize = get_global_size(1);
+
 	if (res_x * step >= xsize - (8 - step)) return;
 	if (res_y * step >= ysize - (8 - step)) return;
 
@@ -500,7 +490,7 @@ __kernel void CombineChannels(
 		DotProduct((float *)&edge_detector_map[3 * res_ix], mask));
 }
 
-inline double Interpolate(const double *array, int size, double sx) {
+inline double Interpolate(__constant double *array, int size, double sx) {
 	double ix = fabs(sx);
 
 	int baseix = (int)(ix);
@@ -517,33 +507,41 @@ inline double Interpolate(const double *array, int size, double sx) {
 	return res;
 }
 
-/*
-std::array<double, 21> MakeLowFreqColorDiffDy() {
-	std::array<double, 21> lut;
-	static const double inc = 5.2511644570349185;
-	lut[0] = 0.0;
-	for (int i = 1; i < 21; ++i) {
-		lut[i] = lut[i - 1] + inc;
-	}
-	return lut;
-}
-
-const double *GetLowFreqColorDiffDy() {
-	static const std::array<double, 21> kLut = MakeLowFreqColorDiffDy();
-	return kLut.data();
-}
-*/
+__constant double XybLowFreqToVals_inc = 5.2511644570349185;
+__constant double XybLowFreqToVals_lut[21] = {
+	0,
+	1 * XybLowFreqToVals_inc,
+	2 * XybLowFreqToVals_inc,
+	3 * XybLowFreqToVals_inc,
+	4 * XybLowFreqToVals_inc,
+	5 * XybLowFreqToVals_inc,
+	6 * XybLowFreqToVals_inc,
+	7 * XybLowFreqToVals_inc,
+	8 * XybLowFreqToVals_inc,
+	9 * XybLowFreqToVals_inc,
+	10 * XybLowFreqToVals_inc,
+	11 * XybLowFreqToVals_inc,
+	12 * XybLowFreqToVals_inc,
+	13 * XybLowFreqToVals_inc,
+	14 * XybLowFreqToVals_inc,
+	15 * XybLowFreqToVals_inc,
+	16 * XybLowFreqToVals_inc,
+	17 * XybLowFreqToVals_inc,
+	18 * XybLowFreqToVals_inc,
+	19 * XybLowFreqToVals_inc,
+	20 * XybLowFreqToVals_inc,
+};
 
 void XybLowFreqToVals(double x, double y, double z,
 	double *valx, double *valy, double *valz) {
-	static const double xmul = 6.64482198135;
-	static const double ymul = 0.837846224276;
-	static const double zmul = 7.34905756986;
-	static const double y_to_z_mul = 0.0812519812628;
+	const double xmul = 6.64482198135;
+	const double ymul = 0.837846224276;
+	const double zmul = 7.34905756986;
+	const double y_to_z_mul = 0.0812519812628;
 	z += y_to_z_mul * y;
 	*valz = z * zmul;
 	*valx = x * xmul;
-	//*valy = Interpolate(GetLowFreqColorDiffDy(), 21, y * ymul);
+	*valy = Interpolate(&XybLowFreqToVals_lut[0], 21, y * ymul);
 }
 
 void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0,
@@ -570,26 +568,31 @@ void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0,
 	res[2] += factor * valz * valz;
 }
 
-__kernel void edgeDetectorMap(__global float *result, __global float *r, __global float *g, __global float* b, __global float *r2, __global float* g2, __global float *b2, int xsize, int ysize, int step)
+__kernel void edgeDetectorMap(__global float *result,
+						      __global float *r, __global float *g, __global float* b,
+						      __global float *r2, __global float* g2, __global float *b2,
+						     int xsize, int ysize, int step)
 {
-	const int result_x = get_global_id(0);
-	const int result_y = get_global_id(1);
+	const int res_x = get_global_id(0);
+	const int res_y = get_global_id(1);
 
-	const int result_xsize = get_global_size(0);
-	const int result_ysize = get_global_size(1);
+	const int res_xsize = get_global_size(0);
+	const int res_ysize = get_global_size(1);
 
-	int pos_x = result_x * step;
-	int pos_y = result_y * step;
+	int pos_x = res_x * step;
+	int pos_y = res_y * step;
+
+	if (res_x * step >= xsize - (8 - step)) return;
+	if (res_y * step >= ysize - (8 - step)) return;
 
 	int local_count = 0;
 	double local_xyb[3] = { 0 };
 	const double w = 0.711100840192;
 
-	//int offset[4][2] = { { 0��0}�� { 0��7}��{ 7��0}��{ 7��7} };
+	int offset[4][2] = {{0,0}, {0,7}, {7,0}, {7,7}};
 	int edgeSize = 3;
 
-	/*
-	for (int k = 0; i < 4; k++)
+	for (int k = 0; k < 4; k++)
 	{
 		int x = pos_x + offset[k][0];
 		int y = pos_y + offset[k][1];
@@ -621,17 +624,547 @@ __kernel void edgeDetectorMap(__global float *result, __global float *r, __globa
 			++local_count;
 		}
 	}
-	*/
 
-	static const double weight = 0.01617112696;
+	const double weight = 0.01617112696;
 	const double mul = weight * 8.0 / local_count;
 
-	int idx = (result_y * result_xsize + result_x) * 3;
+	int idx = (res_y * res_xsize + res_x) * 3;
 	result[idx]     = local_xyb[0];
 	result[idx + 1] = local_xyb[1];
 	result[idx + 2] = local_xyb[2];
 }
 
+__kernel void edgeDetectorLowFreq(__global float *result,
+	__global float *r, __global float *g, __global float* b,
+	__global float *r2, __global float* g2, __global float *b2,
+	int xsize, int ysize, int step)
+{
+	const int res_x = get_global_id(0);
+	const int res_y = get_global_id(1);
+
+	if (res_x < 8 / step) return;
+
+	const int res_xsize = get_global_size(0);
+	const int res_ysize = get_global_size(1);
+
+	int pos_x = (res_x - (8 / step)) * step;
+	int pos_y = res_y * step;
+
+	if (pos_x + 8 >= xsize) return;
+	if (pos_y + 8 >= ysize) return;
+
+	int ix = pos_y * xsize + pos_x;
+
+	double diff[4][3];
+	__global float* blurred0[3] = { r, g, b };
+	__global float* blurred1[3] = { r2, g2, b2 };
+
+	for (int i = 0; i < 3; ++i) {
+		int ix2 = ix + 8;
+		diff[0][i] =
+			((blurred1[i][ix] - blurred0[i][ix]) +
+			(blurred0[i][ix2] - blurred1[i][ix2]));
+		ix2 = ix + 8 * xsize;
+		diff[1][i] =
+			((blurred1[i][ix] - blurred0[i][ix]) +
+			(blurred0[i][ix2] - blurred1[i][ix2]));
+		ix2 = ix + 6 * xsize + 6;
+		diff[2][i] =
+			((blurred1[i][ix] - blurred0[i][ix]) +
+			(blurred0[i][ix2] - blurred1[i][ix2]));
+		ix2 = ix + 6 * xsize - 6;
+		diff[3][i] = pos_x < 8 ? 0 :
+			((blurred1[i][ix] - blurred0[i][ix]) +
+			(blurred0[i][ix2] - blurred1[i][ix2]));
+	}
+	double max_diff_xyb[3] = { 0 };
+	for (int k = 0; k < 4; ++k) {
+		double diff_xyb[3] = { 0 };
+		XybDiffLowFreqSquaredAccumulate(diff[k][0], diff[k][1], diff[k][2],
+			0, 0, 0, 1.0,
+			diff_xyb);
+		for (int i = 0; i < 3; ++i) {
+			max_diff_xyb[i] = max(max_diff_xyb[i], diff_xyb[i]);
+		}
+	}
+
+	int res_ix = res_y * res_xsize + res_x;
+
+	const double kMul = 10;
+
+	result[res_ix * 3] = max_diff_xyb[0] * kMul;
+	result[res_ix * 3 + 1] = max_diff_xyb[1] * kMul;
+	result[res_ix * 3 + 2] = max_diff_xyb[2] * kMul;
+}
+
+#define kBlockEdge 8
+#define kBlockSize (kBlockEdge * kBlockEdge)
+#define kBlockEdgeHalf  (kBlockEdge / 2)
+#define kBlockHalf (kBlockEdge * kBlockEdgeHalf)
+
+__constant double csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = {
+	5.28270670524,
+	0.0,
+	0.0,
+	0.0,
+	0.3831134973,
+	0.676303603859,
+	3.58927792424,
+	18.6104367002,
+	18.6104367002,
+	3.09093131948,
+	1.0,
+	0.498250875965,
+	0.36198671102,
+	0.308982169883,
+	0.1312701920435,
+	2.37370549629,
+	3.58927792424,
+	1.0,
+	2.37370549629,
+	0.991205724152,
+	1.05178802919,
+	0.627264168628,
+	0.4,
+	0.1312701920435,
+	0.676303603859,
+	0.498250875965,
+	0.991205724152,
+	0.5,
+	0.3831134973,
+	0.349686450518,
+	0.627264168628,
+	0.308982169883,
+	0.3831134973,
+	0.36198671102,
+	1.05178802919,
+	0.3831134973,
+	0.12,
+};
+
+typedef struct __Complex
+{
+	double real;
+	double imag;
+}Complex;
+
+constant double kSqrtHalf = 0.70710678118654752440084436210484903;
+
+void RealFFT8(const double* in, Complex* out) {
+	double t1, t2, t3, t5, t6, t7, t8;
+	t8 = in[6];
+	t5 = in[2] - t8;
+	t8 += in[2];
+	out[2].real = t8;
+	out[6].imag = -t5;
+	out[4].imag = t5;
+	t8 = in[4];
+	t3 = in[0] - t8;
+	t8 += in[0];
+	out[0].real = t8;
+	out[4].real = t3;
+	out[6].real = t3;
+	t7 = in[5];
+	t3 = in[1] - t7;
+	t7 += in[1];
+	out[1].real = t7;
+	t8 = in[7];
+	t5 = in[3] - t8;
+	t8 += in[3];
+	out[3].real = t8;
+	t2 = -t5;
+	t6 = t3 - t5;
+	t8 = kSqrtHalf;
+	t6 *= t8;
+	out[5].real = out[4].real - t6;
+	t1 = t3 + t5;
+	t1 *= t8;
+	out[5].imag = out[4].imag - t1;
+	t6 += out[4].real;
+	out[4].real = t6;
+	t1 += out[4].imag;
+	out[4].imag = t1;
+	t5 = t2 - t3;
+	t5 *= t8;
+	out[7].imag = out[6].imag - t5;
+	t2 += t3;
+	t2 *= t8;
+	out[7].real = out[6].real - t2;
+	t2 += out[6].real;
+	out[6].real = t2;
+	t5 += out[6].imag;
+	out[6].imag = t5;
+	t5 = out[2].real;
+	t1 = out[0].real - t5;
+	t7 = out[3].real;
+	t5 += out[0].real;
+	t3 = out[1].real - t7;
+	t7 += out[1].real;
+	t8 = t5 + t7;
+	out[0].real = t8;
+	t5 -= t7;
+	out[1].real = t5;
+	out[2].imag = t3;
+	out[3].imag = -t3;
+	out[3].real = t1;
+	out[2].real = t1;
+	out[0].imag = 0;
+	out[1].imag = 0;
+
+	// Reorder to the correct output order.
+	// TODO: Modify the above computation so that this is not needed.
+	Complex tmp = out[2];
+	out[2] = out[3];
+	out[3] = out[5];
+	out[5] = out[7];
+	out[7] = out[4];
+	out[4] = out[1];
+	out[1] = out[6];
+	out[6] = tmp;
+}
+
+void TransposeBlock(Complex data[kBlockSize]) {
+	for (int i = 0; i < kBlockEdge; i++) {
+		for (int j = 0; j < i; j++) {
+			Complex tmp = data[kBlockEdge * i + j];
+			data[kBlockEdge * i + j] = data[kBlockEdge * j + i];
+			data[kBlockEdge * j + i] = tmp;
+		}
+	}
+}
+
+//  D. J. Bernstein's Fast Fourier Transform algorithm on 4 elements.
+inline void FFT4(Complex* a) {
+	double t1, t2, t3, t4, t5, t6, t7, t8;
+	t5 = a[2].real;
+	t1 = a[0].real - t5;
+	t7 = a[3].real;
+	t5 += a[0].real;
+	t3 = a[1].real - t7;
+	t7 += a[1].real;
+	t8 = t5 + t7;
+	a[0].real = t8;
+	t5 -= t7;
+	a[1].real = t5;
+	t6 = a[2].imag;
+	t2 = a[0].imag - t6;
+	t6 += a[0].imag;
+	t5 = a[3].imag;
+	a[2].imag = t2 + t3;
+	t2 -= t3;
+	a[3].imag = t2;
+	t4 = a[1].imag - t5;
+	a[3].real = t1 + t4;
+	t1 -= t4;
+	a[2].real = t1;
+	t5 += a[1].imag;
+	a[0].imag = t6 + t5;
+	t6 -= t5;
+	a[1].imag = t6;
+}
+
+//  D. J. Bernstein's Fast Fourier Transform algorithm on 8 elements.
+void FFT8(Complex* a) {
+	double t1, t2, t3, t4, t5, t6, t7, t8;
+
+	t7 = a[4].imag;
+	t4 = a[0].imag - t7;
+	t7 += a[0].imag;
+	a[0].imag = t7;
+
+	t8 = a[6].real;
+	t5 = a[2].real - t8;
+	t8 += a[2].real;
+	a[2].real = t8;
+
+	t7 = a[6].imag;
+	a[6].imag = t4 - t5;
+	t4 += t5;
+	a[4].imag = t4;
+
+	t6 = a[2].imag - t7;
+	t7 += a[2].imag;
+	a[2].imag = t7;
+
+	t8 = a[4].real;
+	t3 = a[0].real - t8;
+	t8 += a[0].real;
+	a[0].real = t8;
+
+	a[4].real = t3 - t6;
+	t3 += t6;
+	a[6].real = t3;
+
+	t7 = a[5].real;
+	t3 = a[1].real - t7;
+	t7 += a[1].real;
+	a[1].real = t7;
+
+	t8 = a[7].imag;
+	t6 = a[3].imag - t8;
+	t8 += a[3].imag;
+	a[3].imag = t8;
+	t1 = t3 - t6;
+	t3 += t6;
+
+	t7 = a[5].imag;
+	t4 = a[1].imag - t7;
+	t7 += a[1].imag;
+	a[1].imag = t7;
+
+	t8 = a[7].real;
+	t5 = a[3].real - t8;
+	t8 += a[3].real;
+	a[3].real = t8;
+
+	t2 = t4 - t5;
+	t4 += t5;
+
+	t6 = t1 - t4;
+	t8 = kSqrtHalf;
+	t6 *= t8;
+	a[5].real = a[4].real - t6;
+	t1 += t4;
+	t1 *= t8;
+	a[5].imag = a[4].imag - t1;
+	t6 += a[4].real;
+	a[4].real = t6;
+	t1 += a[4].imag;
+	a[4].imag = t1;
+
+	t5 = t2 - t3;
+	t5 *= t8;
+	a[7].imag = a[6].imag - t5;
+	t2 += t3;
+	t2 *= t8;
+	a[7].real = a[6].real - t2;
+	t2 += a[6].real;
+	a[6].real = t2;
+	t5 += a[6].imag;
+	a[6].imag = t5;
+
+	FFT4(a);
+
+	// Reorder to the correct output order.
+	// TODO: Modify the above computation so that this is not needed.
+	Complex tmp = a[2];
+	a[2] = a[3];
+	a[3] = a[5];
+	a[5] = a[7];
+	a[7] = a[4];
+	a[4] = a[1];
+	a[1] = a[6];
+	a[6] = tmp;
+}
+
+double abssq(const Complex c) {
+	return c.real * c.real + c.imag * c.imag;
+}
+
+void ButteraugliFFTSquared(double block[kBlockSize]) {
+	double global_mul = 0.000064;
+	Complex block_c[kBlockSize];
+
+	for (int y = 0; y < kBlockEdge; ++y) {
+		RealFFT8(block + y * kBlockEdge, block_c + y * kBlockEdge);
+	}
+	TransposeBlock(block_c);
+	double r0[kBlockEdge];
+	double r1[kBlockEdge];
+	for (int x = 0; x < kBlockEdge; ++x) {
+		r0[x] = block_c[x].real;
+		r1[x] = block_c[kBlockHalf + x].real;
+	}
+	RealFFT8(r0, block_c);
+	RealFFT8(r1, block_c + kBlockHalf);
+	for (int y = 1; y < kBlockEdgeHalf; ++y) {
+		FFT8(block_c + y * kBlockEdge);
+	}
+	for (int i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) {
+		block[i] = abssq(block_c[i]);
+		block[i] *= global_mul;
+	}
+}
+
+__constant double MakeHighFreqColorDiffDy_off = 1.4103373714040413;
+__constant double MakeHighFreqColorDiffDy_inc = 0.7084088867024;
+__constant double MakeHighFreqColorDiffDy_lut[21] ={
+	0.0,
+	MakeHighFreqColorDiffDy_off,
+	MakeHighFreqColorDiffDy_off + 1 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 2 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 3 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 4 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 5 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 6 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 7 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 8 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 9 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 10 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 11 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 12 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 13 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 14 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 15 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 16 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 17 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 18 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 19 * MakeHighFreqColorDiffDy_inc,
+};
+
+double RemoveRangeAroundZero(double v, double range) {
+	if (v >= -range && v < range) {
+		return 0;
+	}
+	if (v < 0) {
+		return v + range;
+	}
+	else {
+		return v - range;
+	}
+}
+
+// Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared
+// 3-dimensional xybdiff of the two blocks to diff_xyb_{dc,ac} and the average
+// diff on the edges to diff_xyb_edge_dc.
+void ButteraugliBlockDiff(double xyb0[3 * kBlockSize],
+	double xyb1[3 * kBlockSize],
+	double diff_xyb_dc[3],
+	double diff_xyb_ac[3],
+	double diff_xyb_edge_dc[3]) {
+
+	double avgdiff_xyb[3] = { 0.0 };
+	double avgdiff_edge[3][4] = { { 0.0 } };
+	for (int i = 0; i < 3 * kBlockSize; ++i) {
+		const double diff_xyb = xyb0[i] - xyb1[i];
+		const int c = i / kBlockSize;
+		avgdiff_xyb[c] += diff_xyb / kBlockSize;
+		const int k = i % kBlockSize;
+		const int kx = k % kBlockEdge;
+		const int ky = k / kBlockEdge;
+		const int h_edge_idx = ky == 0 ? 1 : ky == 7 ? 3 : -1;
+		const int v_edge_idx = kx == 0 ? 0 : kx == 7 ? 2 : -1;
+		if (h_edge_idx >= 0) {
+			avgdiff_edge[c][h_edge_idx] += diff_xyb / kBlockEdge;
+		}
+		if (v_edge_idx >= 0) {
+			avgdiff_edge[c][v_edge_idx] += diff_xyb / kBlockEdge;
+		}
+	}
+	XybDiffLowFreqSquaredAccumulate(avgdiff_xyb[0],
+		avgdiff_xyb[1],
+		avgdiff_xyb[2],
+		0, 0, 0, csf8x8[0],
+		diff_xyb_dc);
+	for (int i = 0; i < 4; ++i) {
+		XybDiffLowFreqSquaredAccumulate(avgdiff_edge[0][i],
+			avgdiff_edge[1][i],
+			avgdiff_edge[2][i],
+			0, 0, 0, csf8x8[0],
+			diff_xyb_edge_dc);
+	}
+
+	double* xyb_avg = xyb0;
+	double* xyb_halfdiff = xyb1;
+	for (int i = 0; i < 3 * kBlockSize; ++i) {
+		double avg = (xyb0[i] + xyb1[i]) / 2;
+		double halfdiff = (xyb0[i] - xyb1[i]) / 2;
+		xyb_avg[i] = avg;
+		xyb_halfdiff[i] = halfdiff;
+	}
+	double *y_avg = &xyb_avg[kBlockSize];
+	double *x_halfdiff_squared = &xyb_halfdiff[0];
+	double *y_halfdiff = &xyb_halfdiff[kBlockSize];
+	double *z_halfdiff_squared = &xyb_halfdiff[2 * kBlockSize];
+	ButteraugliFFTSquared(y_avg);
+	ButteraugliFFTSquared(x_halfdiff_squared);
+	ButteraugliFFTSquared(y_halfdiff);
+	ButteraugliFFTSquared(z_halfdiff_squared);
+
+	const double xmul = 64.8;
+	const double ymul = 1.753123908348329;
+	const double ymul2 = 1.51983458269;
+	const double zmul = 2.4;
+
+	for (size_t i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) {
+		double d = csf8x8[i];
+		diff_xyb_ac[0] += d * xmul * x_halfdiff_squared[i];
+		diff_xyb_ac[2] += d * zmul * z_halfdiff_squared[i];
+
+		y_avg[i] = sqrt(y_avg[i]);
+		y_halfdiff[i] = sqrt(y_halfdiff[i]);
+		double y0 = y_avg[i] - y_halfdiff[i];
+		double y1 = y_avg[i] + y_halfdiff[i];
+		// Remove the impact of small absolute values.
+		// This improves the behavior with flat noise.
+		const double ylimit = 0.04;
+		y0 = RemoveRangeAroundZero(y0, ylimit);
+		y1 = RemoveRangeAroundZero(y1, ylimit);
+		if (y0 != y1) {
+			double valy0 = Interpolate(&MakeHighFreqColorDiffDy_lut[0], 21, y0 * ymul2);
+			double valy1 = Interpolate(&MakeHighFreqColorDiffDy_lut[0], 21, y1 * ymul2);
+			double valy = ymul * (valy0 - valy1);
+			diff_xyb_ac[1] += d * valy * valy;
+		}
+	}
+}
+
+__kernel void blockDiffMap(__global float* r, __global float* g, __global float* b,
+	__global float* r2, __global float* g2, __global float* b2,
+	__global float* block_diff_dc, __global float* block_diff_ac,
+	int xsize, int ysize, int step)
+{
+	const int res_x = get_global_id(0);
+	const int res_y = get_global_id(1);
+	// Each work-item handles the 8x8 block anchored at pixel (res_x * step, res_y * step).
+	const int res_xsize = get_global_size(0);
+	const int res_ysize = get_global_size(1);
+
+	int pos_x = res_x * step;
+	int pos_y = res_y * step;
+	// Bounds guard: the x position is checked against the image width, the y position against its height.
+	if ((pos_x + kBlockEdge - step - 1) >= xsize) return;
+	if ((pos_y + kBlockEdge - step - 1) >= ysize) return;
+
+	size_t res_ix = res_y * res_xsize + res_x;
+	size_t offset = min(res_y * step, ysize - 8) * xsize + min(res_x * step, xsize - 8);	// clamp so the 8x8 read stays inside the image
+
+	double block0[3 * kBlockEdge * kBlockEdge];
+	double block1[3 * kBlockEdge * kBlockEdge];
+
+	double *block0_r = &block0[0];
+	double *block0_g = &block0[kBlockEdge * kBlockEdge];
+	double *block0_b = &block0[2 * kBlockEdge * kBlockEdge];
+
+	double *block1_r = &block1[0];
+	double *block1_g = &block1[kBlockEdge * kBlockEdge];
+	double *block1_b = &block1[2 * kBlockEdge * kBlockEdge];
+
+	for (int y = 0; y < kBlockEdge; y++)
+	{
+		for (int x = 0; x < kBlockEdge; x++)
+		{
+			block0_r[kBlockEdge * y + x] = r[offset + y * xsize + x];
+			block0_g[kBlockEdge * y + x] = g[offset + y * xsize + x];
+			block0_b[kBlockEdge * y + x] = b[offset + y * xsize + x];
+			block1_r[kBlockEdge * y + x] = r2[offset + y * xsize + x];
+			block1_g[kBlockEdge * y + x] = g2[offset + y * xsize + x];
+			block1_b[kBlockEdge * y + x] = b2[offset + y * xsize + x];
+		}
+	}
+
+	double diff_xyb_dc[3] = { 0.0 };
+	double diff_xyb_ac[3] = { 0.0 };
+	double diff_xyb_edge_dc[3] = { 0.0 };
+
+	ButteraugliBlockDiff(block0, block1, diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc);
+	// Only the DC and AC diffs are stored; diff_xyb_edge_dc is computed but not written out here.
+	for (int i = 0; i < 3; i++)
+	{
+		block_diff_dc[3 * res_ix + i] = diff_xyb_dc[i];
+		block_diff_ac[3 * res_ix + i] = diff_xyb_ac[i];
+	}
+}
 __kernel void MaskHighIntensityChange(
 	__global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b,
 	__global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b,
@@ -691,29 +1224,46 @@ __kernel void MaskHighIntensityChange(
 	xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]);
 }
 
+
+__constant double XybToVals_off = 11.38708334481672;
+__constant double XybToVals_inc = 14.550189611520716;
+__constant double XybToVals_lut[21] = {
+	0,
+	XybToVals_off,
+	XybToVals_off + 1 * XybToVals_inc,
+	XybToVals_off + 2 * XybToVals_inc,
+	XybToVals_off + 3 * XybToVals_inc,
+	XybToVals_off + 4 * XybToVals_inc,
+	XybToVals_off + 5 * XybToVals_inc,
+	XybToVals_off + 6 * XybToVals_inc,
+	XybToVals_off + 7 * XybToVals_inc,
+	XybToVals_off + 8 * XybToVals_inc,
+	XybToVals_off + 9 * XybToVals_inc,
+	XybToVals_off + 10 * XybToVals_inc,
+	XybToVals_off + 11 * XybToVals_inc,
+	XybToVals_off + 12 * XybToVals_inc,
+	XybToVals_off + 13 * XybToVals_inc,
+	XybToVals_off + 14 * XybToVals_inc,
+	XybToVals_off + 15 * XybToVals_inc,
+	XybToVals_off + 16 * XybToVals_inc,
+	XybToVals_off + 17 * XybToVals_inc,
+	XybToVals_off + 18 * XybToVals_inc,
+	XybToVals_off + 19 * XybToVals_inc,
+};
+
 void XybToVals(
 	double x, double y, double z,
 	double *valx, double *valy, double *valz)
 {
-	static const double xmul = 0.758304045695;
-	static const double ymul = 2.28148649801;
-	static const double zmul = 1.87816926918;
+	const double xmul = 0.758304045695;
+    const double ymul = 2.28148649801;
+	const double zmul = 1.87816926918;
 
-	double lut[21] = { 0.0 };
-	const double off = 11.38708334481672;
-	const double inc = 14.550189611520716;
-	lut[0] = 0.0;
-	lut[1] = off;
-	for (int i = 2; i < 21; ++i) {
-		lut[i] = lut[i - 1] + inc;
-	}
-
-	*valx = Interpolate(lut, 21, x * xmul);
-	*valy = Interpolate(lut, 21, y * ymul);
+	*valx = Interpolate(&XybToVals_lut[0], 21, x * xmul);
+	*valy = Interpolate(&XybToVals_lut[0], 21, y * ymul);
 	*valz = zmul * z;
 }
 
-
 __kernel void DiffPrecompute(
 	__global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b,
 	__global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b,
@@ -778,4 +1328,111 @@ __kernel void DiffPrecompute(
 	sup1 = fabs(valsh1[2]) + fabs(valsv1[2]);
 	m = min(sup0, sup1);
 	mask_b[ix] = (float)(m);
-}
\ No newline at end of file
+}
+
+void UpsampleSquareRoot(float *diffmap, size_t xsize, size_t ysize, int step, float *diffmap_out)
+{
+	const int res_x = get_global_id(0);
+	const int res_y = get_global_id(1);
+
+	if (res_y + 8 - step >= ysize) return;
+	if (res_x + 8 - step >= xsize) return;
+
+	int s2 = (8 - step) / 2;
+	// Upsample and take square root.
+	const size_t res_xsize = (xsize + step - 1) / step;
+	size_t res_ix = (res_y * res_xsize + res_x) / step;
+	float orig_val = diffmap[res_ix];
+	const float kInitialSlope = 100;
+	// TODO(b/29974893): Until that is fixed do not call sqrt on very small
+	// numbers.
+	double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope))
+		? kInitialSlope * orig_val
+		: sqrt(orig_val);
+	for (size_t off_y = 0; off_y < step; ++off_y) {
+		for (size_t off_x = 0; off_x < step; ++off_x) {
+			diffmap_out[(res_y + off_y + s2) * xsize +
+				res_x + off_x + s2] = val;
+		}
+	}
+}
+
+__kernel void CalculateDiffmapGetBlurred(__global float *diffmap, int s, int s2, __global float *blurred)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	const int xsize = get_global_size(0);	// row stride is taken from the launch size
+	const int ysize = get_global_size(1);
+	// NOTE(review): if the launch width is (image width - s), the source stride should probably be (xsize + s) — confirm against the host call.
+	blurred[y * xsize + x] = diffmap[(y + s2) * xsize + s + x + s2];
+}
+
+__kernel void GetDiffmapFromBlurred(__global float *blurred, int s, int s2, __global float *diffmap)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	const int xsize = get_global_size(0);
+	const int ysize = get_global_size(1);
+	// Adds the blurred value, scaled by mul1, into the diffmap cell shifted by (s2, s2).
+	const double mul1 = 24.8235314874;
+	diffmap[(y + s2) * xsize + x + s2]	+= (float)(mul1) * blurred[y * (xsize - s) + x];
+
+}
+
+__kernel void AverageAddImage(__global float *img, __global float *tmp0, __global float *tmp1)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	const int xsize = get_global_size(0);
+	const int ysize = get_global_size(1);
+	// NOTE(review): different work-items "+=" into the same img[] cells with no synchronization; this scatter-add races when they run concurrently.
+	const int row0 = y * xsize;
+	if (x == 0) // execute once per y
+	{
+		img[row0 + 1] += tmp0[row0];
+		img[row0 + 0] += tmp0[row0 + 1];
+		img[row0 + 2] += tmp0[row0 + 1];
+
+		img[row0 + xsize - 3] += tmp0[row0 + xsize - 2];
+		img[row0 + xsize - 1] += tmp0[row0 + xsize - 2];
+		img[row0 + xsize - 2] += tmp0[row0 + xsize - 1];
+
+		if (y > 0) {
+			const int rowd1 = row0 - xsize;
+			img[rowd1 + 1] += tmp1[row0];
+			img[rowd1 + 0] += tmp0[row0];
+
+			img[rowd1 + xsize - 1] += tmp0[row0 + xsize - 1];
+			img[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1];
+		}
+		if (y + 1 < ysize) {
+			const int rowu1 = row0 + xsize;
+			img[rowu1 + 1] += tmp1[row0];
+			img[rowu1 + 0] += tmp0[row0];
+
+			img[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1];
+			img[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1];
+		}
+	}
+
+	if (x >= 2 && x < xsize - 2)
+	{
+		img[row0 + x - 1] += tmp0[row0 + x];
+		img[row0 + x + 1] += tmp0[row0 + x];
+	}
+
+	if (x >= 1 && x < xsize - 1) {
+		if (y > 0) {
+			const int rowd1 = row0 - xsize;
+			img[rowd1 + x + 1] += tmp1[row0 + x];
+			img[rowd1 + x + 0] += tmp0[row0 + x];
+			img[rowd1 + x - 1] += tmp1[row0 + x];
+		}
+		if (y + 1 < ysize) {
+			const int rowu1 = row0 + xsize;
+			img[rowu1 + x + 1] += tmp1[row0 + x];
+			img[rowu1 + x + 0] += tmp0[row0 + x];
+			img[rowu1 + x - 1] += tmp1[row0 + x];
+		}
+	}
+}
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 4ec86456..08d29fb7 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -63,6 +63,12 @@ ocl_args_d_t& getOcl(void)
 	ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "CombineChannels", &err);
 	ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "MaskHighIntensityChange", &err);
 	ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "DiffPrecompute", &err);
+	ocl.kernel[KERNEL_CALCULATEDIFFMAPGETBLURRED] = clCreateKernel(ocl.program, "CalculateDiffmapGetBlurred", &err);
+	ocl.kernel[KERNEL_GETDIFFMAPFROMBLURRED] = clCreateKernel(ocl.program, "GetDiffmapFromBlurred", &err);
+	ocl.kernel[KERNEL_AVERAGEADDIMAGE] = clCreateKernel(ocl.program, "AverageAddImage", &err);
+	ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "edgeDetectorMap", &err);
+	ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "blockDiffMap", &err);
+	ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "edgeDetectorLowFreq", &err);
 
 	return ocl;
 }
@@ -471,7 +477,6 @@ void clMaskHighIntensityChangeEx(ocl_channels xyb0_arg/*in,out*/,
 	}
 }
 
-// strong todo
 void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, size_t step, cl_mem result/*out*/)
 {
 	cl_int channel_size = xsize * ysize * sizeof(float);
@@ -489,40 +494,134 @@ void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size
 		clBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]);
 		clBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]);
 	}
-	// EdgeDetectorLowFreq
 
+	cl_int clxsize = xsize;
+	cl_int clysize = ysize;
+	cl_int clstep = step;
+
+	cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTOR];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), &result);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), &rgb_blured.r);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb_blured.g);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb_blured.b);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb2_blured.r);
+	clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2_blured.g);
+	clSetKernelArg(kernel, 6, sizeof(cl_mem), &rgb2_blured.b);
+	clSetKernelArg(kernel, 7, sizeof(cl_int), &clxsize);
+	clSetKernelArg(kernel, 8, sizeof(cl_int), &clysize);
+	clSetKernelArg(kernel, 9, sizeof(cl_int), &clstep);
+
+	const size_t res_xsize = (xsize + step - 1) / step;
+	const size_t res_ysize = (xsize + step - 1) / step;
+
+	size_t globalWorkSize[2] = { res_xsize, res_ysize};
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clEdgeDetectorMapEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clEdgeDetectorMapEx() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
+
+	ocl.releaseMemChannels(rgb_blured);
+	ocl.releaseMemChannels(rgb2_blured);
 }
 
-// strong todo
 void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2,
-	size_t xsize, size_t ysize,
+	size_t xsize, size_t ysize, size_t step,
 	cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/)
 {
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+
+	cl_int clxsize = xsize;
+	cl_int clysize = ysize;
+	cl_int clstep = step;
 
+	cl_kernel kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), &rgb.r);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), &rgb.g);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb.b);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb2.r);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb2.g);
+	clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2.b);
+	clSetKernelArg(kernel, 6, sizeof(cl_mem), &block_diff_dc);
+	clSetKernelArg(kernel, 7, sizeof(cl_mem), &block_diff_ac);
+	clSetKernelArg(kernel, 8, sizeof(cl_int), &clxsize);
+	clSetKernelArg(kernel, 9, sizeof(cl_int), &clysize);
+	clSetKernelArg(kernel, 10, sizeof(cl_int), &clstep);
+
+	const size_t res_xsize = (xsize + step - 1) / step;
+	const size_t res_ysize = (xsize + step - 1) / step;
+
+	size_t globalWorkSize[2] = { res_xsize, res_ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clBlockDiffMapEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clBlockDiffMapEx() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
 }
 
-// strong todo
 void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2,
-	size_t xsize, size_t ysize,
+	size_t xsize, size_t ysize, size_t step,
 	cl_mem block_diff_ac/*out*/)
 {
 	cl_int channel_size = xsize * ysize * sizeof(float);
 
 	static const double kSigma = 14;
-	static const double kMul = 10;
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
 	ocl_channels rgb_blured = ocl.allocMemChannels(channel_size);
 	ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size);
 
-	//static const double kSigma[3] = { 1.5, 0.586, 0.4 };
-
 	for (int i = 0; i < 3; i++)
 	{
 		clBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]);
 		clBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]);
 	}
+
+	cl_int clxsize = xsize;
+	cl_int clysize = ysize;
+	cl_int clstep = step;
+
+	cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), &block_diff_ac);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), &rgb_blured.r);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb_blured.g);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb_blured.b);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb2_blured.r);
+	clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2_blured.g);
+	clSetKernelArg(kernel, 6, sizeof(cl_mem), &rgb2_blured.b);
+	clSetKernelArg(kernel, 7, sizeof(cl_int), &clxsize);
+	clSetKernelArg(kernel, 8, sizeof(cl_int), &clysize);
+	clSetKernelArg(kernel, 9, sizeof(cl_int), &clstep);
+
+	const size_t res_xsize = (xsize + step - 1) / step;
+	const size_t res_ysize = (xsize + step - 1) / step;
+
+	size_t globalWorkSize[2] = { res_xsize, res_ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clEdgeDetectorLowFreqEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clEdgeDetectorLowFreqEx() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
+
+	ocl.releaseMemChannels(rgb_blured);
+	ocl.releaseMemChannels(rgb2_blured);
 }
 
 void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size_t ysize, ocl_channels mask/*out*/)
@@ -579,7 +678,29 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, float w)
 	}
 }
 
-// ian todo
+void clAverageAddImage(cl_mem img, cl_mem tmp0, cl_mem tmp1, size_t xsize, size_t ysize)
+{
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+
+	cl_kernel kernel = ocl.kernel[KERNEL_AVERAGEADDIMAGE];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&tmp0);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&tmp1);
+
+	size_t globalWorkSize[2] = { xsize, ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clAverageAddImage() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clAverageAddImage() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
+}
+
 void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize)
 {
 	if (xsize < 4 || ysize < 4) {
@@ -606,45 +727,8 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize)
 	static const float scale = 1.0f / (5.0f + 4 * w);
 
 	clScaleImageEx(tmp1, xsize * ysize, w);
-	/* TODO
-	for (int y = 0; y < ysize; y++) {
-		const int row0 = y * xsize;
-		result[row0 + 1] += tmp0[row0];
-		result[row0 + 0] += tmp0[row0 + 1];
-		result[row0 + 2] += tmp0[row0 + 1];
-		for (int x = 2; x < xsize - 2; ++x) {
-			result[row0 + x - 1] += tmp0[row0 + x];
-			result[row0 + x + 1] += tmp0[row0 + x];
-		}
-		result[row0 + xsize - 3] += tmp0[row0 + xsize - 2];
-		result[row0 + xsize - 1] += tmp0[row0 + xsize - 2];
-		result[row0 + xsize - 2] += tmp0[row0 + xsize - 1];
-		if (y > 0) {
-			const int rowd1 = row0 - xsize;
-			result[rowd1 + 1] += tmp1[row0];
-			result[rowd1 + 0] += tmp0[row0];
-			for (int x = 1; x < xsize - 1; ++x) {
-				result[rowd1 + x + 1] += tmp1[row0 + x];
-				result[rowd1 + x + 0] += tmp0[row0 + x];
-				result[rowd1 + x - 1] += tmp1[row0 + x];
-			}
-			result[rowd1 + xsize - 1] += tmp0[row0 + xsize - 1];
-			result[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1];
-		}
-		if (y + 1 < ysize) {
-			const int rowu1 = row0 + xsize;
-			result[rowu1 + 1] += tmp1[row0];
-			result[rowu1 + 0] += tmp0[row0];
-			for (int x = 1; x < xsize - 1; ++x) {
-				result[rowu1 + x + 1] += tmp1[row0 + x];
-				result[rowu1 + x + 0] += tmp0[row0 + x];
-				result[rowu1 + x - 1] += tmp1[row0 + x];
-			}
-			result[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1];
-			result[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1];
-		}
-	}
-	*/
+	clAverageAddImage(result, tmp0, tmp1, xsize, ysize);
+
 	err = clEnqueueCopyBuffer(ocl.commandQueue, result, img, 0, 0, len, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
@@ -757,16 +841,17 @@ void clCombineChannelsEx(
 	cl_mem edge_detector_map,
 	size_t xsize, size_t ysize,
 	size_t step,
-	size_t res_xsize,
 	cl_mem result/*out*/)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
+	const size_t res_xsize = (xsize + step - 1) / step;
+	const size_t res_ysize = (xsize + step - 1) / step;
+
 	cl_int clxsize = xsize;
 	cl_int clysize = ysize;
 	cl_int clstep = step;
-	cl_int clres_xsize = res_xsize;
 
 	cl_kernel kernel = ocl.kernel[KERNEL_COMBINECHANNELS];
 	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.r);
@@ -781,10 +866,9 @@ void clCombineChannelsEx(
 	clSetKernelArg(kernel, 9, sizeof(cl_int), (void*)&clxsize);
 	clSetKernelArg(kernel, 10, sizeof(cl_int), (void*)&clysize);
 	clSetKernelArg(kernel, 11, sizeof(cl_int), (void*)&clstep);
-	clSetKernelArg(kernel, 12, sizeof(cl_int), (void*)&clres_xsize);
-	clSetKernelArg(kernel, 13, sizeof(cl_mem), (void*)&result);
+	clSetKernelArg(kernel, 12, sizeof(cl_mem), (void*)&result);
 
-	size_t globalWorkSize[2] = { xsize / step, ysize /step };
+	size_t globalWorkSize[2] = { res_xsize, res_ysize};
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
@@ -797,61 +881,115 @@ void clCombineChannelsEx(
 	}
 }
 
-// strong todo
-void clCalculateDiffmapEx(cl_mem result/*in,out*/, size_t xsize, size_t ysize, int step)
+void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step)
 {
-/*
-    int s2 = (8 - step) / 2;
-    {
-        // Upsample and take square root.
-        std::vector<float> diffmap_out(xsize * ysize);
-        const size_t res_xsize = (xsize + step - 1) / step;
-        for (size_t res_y = 0; res_y + 8 - step < ysize; res_y += step) {
-            for (size_t res_x = 0; res_x + 8 - step < xsize; res_x += step) {
-                size_t res_ix = (res_y * res_xsize + res_x) / step;
-                float orig_val = (*diffmap)[res_ix];
-                constexpr float kInitialSlope = 100;
-                // TODO(b/29974893): Until that is fixed do not call sqrt on very small
-                // numbers.
-                double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope))
-                    ? kInitialSlope * orig_val
-                    : std::sqrt(orig_val);
-                for (size_t off_y = 0; off_y < step; ++off_y) {
-                    for (size_t off_x = 0; off_x < step; ++off_x) {
-                        diffmap_out[(res_y + off_y + s2) * xsize +
-                            res_x + off_x + s2] = val;
-                    }
-                }
-            }
-        }
-        *diffmap = diffmap_out;
-    }
-*/
-    static const double kSigma = 8.8510880283;
-    static const double mul1 = 24.8235314874;
-    static const double scale = 1.0 / (1.0 + mul1);
-    const int s = 8 - step;
-    const int s2 = (8 - step) / 2;
-
-    cl_mem blurred;
-/*
-    for (size_t y = 0; y < ysize - s; ++y) {
-    for (size_t x = 0; x < xsize - s; ++x) {
-    blurred[y * (xsize - s) + x] = (*diffmap)[(y + s2) * xsize + x + s2];
-    }
-    }
-*/
-    static const double border_ratio = 0.03027655136;
-    clBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio);
-/*
-    for (size_t y = 0; y < ysize - s; ++y) {
-    for (size_t x = 0; x < xsize - s; ++x) {
-    (*diffmap)[(y + s2) * xsize + x + s2]
-    += static_cast<float>(mul1) * blurred[y * (xsize - s) + x];
-    }
-    }
-*/
-    clScaleImageEx(result, xsize * ysize, scale);
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+
+	cl_int clxsize = xsize;
+	cl_int clysize = ysize;
+	cl_int clstep = step;
+	ocl.allocC(xsize * ysize * sizeof(float));
+
+	cl_kernel kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&diffmap);
+	clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&xsize);
+	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&ysize);
+	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&step);
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.dstMem);
+
+	const size_t res_xsize = (xsize + step - 1) / step;
+	const size_t res_ysize = (xsize + step - 1) / step;
+
+	size_t globalWorkSize[2] = { res_xsize, res_ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clUpsampleSquareRootEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clEnqueueCopyBuffer(ocl.commandQueue, ocl.dstMem, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clUpsampleSquareRootEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clUpsampleSquareRootEx() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
+}
+
+void clCalculateDiffmapGetBlurredEx(cl_mem diffmap, size_t xsize, size_t ysize, int s, int s2, cl_mem blurred)
+{
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+
+	cl_int cls = s;
+	cl_int cls2 = s2;
+	cl_kernel kernel = ocl.kernel[KERNEL_CALCULATEDIFFMAPGETBLURRED];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&diffmap);
+	clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&s);
+	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&s2);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&blurred);
+
+	size_t globalWorkSize[2] = { xsize, ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clCalculateDiffmapGetBlurredEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clCalculateDiffmapGetBlurredEx() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
+}
+
+void clGetDiffmapFromBlurredEx(cl_mem diffmap, size_t xsize, size_t ysize, int s, int s2, cl_mem blurred)
+{
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+
+	cl_int cls = s;
+	cl_int cls2 = s2;
+	cl_kernel kernel = ocl.kernel[KERNEL_CALCULATEDIFFMAPGETBLURRED];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&blurred);
+	clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&s);
+	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&s2);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&diffmap);
+
+	size_t globalWorkSize[2] = { xsize, ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clGetDiffmapFromBlurredEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clGetDiffmapFromBlurredEx() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
+}
+
+void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, int step)
+{
+	clUpsampleSquareRootEx(diffmap, xsize, ysize, step);
+
+	static const double kSigma = 8.8510880283;
+	static const double mul1 = 24.8235314874;
+	static const double scale = 1.0 / (1.0 + mul1);
+	const int s = 8 - step;
+	int s2 = (8 - step) / 2;
+
+	ocl_args_d_t &ocl = getOcl();
+	ocl.allocA((xsize - s) * (ysize - s) * sizeof(float));
+	cl_mem blurred = ocl.srcA;
+	clCalculateDiffmapGetBlurredEx(diffmap, (xsize - s), (ysize - s), s, s2, blurred);
+
+	static const double border_ratio = 0.03027655136;
+	clBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio);
+	clGetDiffmapFromBlurredEx(diffmap, (xsize - s), (ysize - s), s, s2, blurred);
+	clScaleImageEx(diffmap, xsize * ysize, scale);
 }
 
 void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
@@ -887,31 +1025,29 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	err = clEnqueueCopyBuffer(ocl.commandQueue, xyb1.b, xyb1_c.b, 0, 0, channel_size, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
-	cl_mem edge_detector_map = ocl.allocMem(3 * xsize * ysize * sizeof(float));
-	cl_mem block_diff_dc = ocl.allocMem(3 * xsize * ysize * sizeof(float));
-	cl_mem block_diff_ac = ocl.allocMem(3 * xsize * ysize * sizeof(float));
-
+	cl_mem mem_result = ocl.allocMem(channel_size);
 	ocl_channels mask = ocl.allocMemChannels(channel_size);
 	ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
 
-	size_t res_xsize_; // ��Ա��������Ҫ����
-	size_t res_ysize_; // ��Ա��������Ҫ����
-	cl_mem mem_result = ocl.allocMem(channel_size);
+	const size_t res_xsize = (xsize + step - 1) / step;
+	const size_t res_ysize = (xsize + step - 1) / step;
+
+	cl_mem edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
+	cl_mem block_diff_dc = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
+	cl_mem block_diff_ac = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
 
 	clMaskHighIntensityChangeEx(xyb0_arg, xyb1_c, xyb0, xyb1, xsize, ysize);
 
-	//clEdgeDetectorMapEx(xyb0_arg, xyb1, xsize, ysize, edge_detector_map);
-	clBlockDiffMapEx(xyb0_arg, xyb1, xsize, ysize, block_diff_dc, block_diff_ac);
-	clEdgeDetectorLowFreqEx(xyb0_arg, xyb1, xsize, ysize, block_diff_ac);
+	clEdgeDetectorMapEx(xyb0_arg, xyb1, xsize, ysize, step, edge_detector_map);
+	clBlockDiffMapEx(xyb0_arg, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac);
+	clEdgeDetectorLowFreqEx(xyb0_arg, xyb1, xsize, ysize, step, block_diff_ac);
 
 	clMaskEx(xyb0_arg, xyb1, xsize, ysize, mask, mask_dc);
 
-	size_t xsize_ = 0, ysize_ = 0; // ��Ա��������Ҫ����
-	clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize_, ysize_, step, res_xsize_, mem_result);
+	clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, step, mem_result);
 
     clCalculateDiffmapEx(mem_result, xsize, ysize, step);
 
-
 	cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
 	memcpy(result, result_r, channel_size);
 
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 3e20a2f3..ecd3af86 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -55,6 +55,13 @@ enum KernelName {
 	KERNEL_COMBINECHANNELS,
 	KERNEL_MASKHIGHINTENSITYCHANGE,
 	KERNEL_DIFFPRECOMPUTE,
+	KERNEL_UPSAMPLESQUAREROOT,
+	KERNEL_CALCULATEDIFFMAPGETBLURRED,
+	KERNEL_GETDIFFMAPFROMBLURRED,
+	KERNEL_AVERAGEADDIMAGE,
+	KERNEL_EDGEDETECTOR,
+	KERNEL_BLOCKDIFFMAP,
+	KERNEL_EDGEDETECTORLOWFREQ,
 	KERNEL_COUNT,
 };
 

From 4aeec41db3316788059ec1c72b0dca595d996021 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 6 May 2017 15:56:27 +0800
Subject: [PATCH 033/189] test for clDiffmapOpsinDynamicsImage

---
 third_party/butteraugli/butteraugli/butteraugli.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 2fd045d8..cc624bed 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -1120,14 +1120,14 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage(
     const std::vector<std::vector<float>> &xyb0_arg,
     std::vector<std::vector<float>> &xyb1,
     std::vector<float> &result) {
-/*
+
 	if (g_useOpenCL && xsize_ > 100 && ysize_ > 100)
 	{
 		result.resize(xsize_ * ysize_);
 		clDiffmapOpsinDynamicsImage(xyb0_arg[0].data(), xyb0_arg[1].data(), xyb0_arg[2].data(),
-			xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, result.data());
+			xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, result.data());
 	}
-*/
+
 
   if (xsize_ < 8 || ysize_ < 8) return;
   auto xyb0 = xyb0_arg;

From da654cb6cfdda98f3b9d59161b68e4014a51ebcd Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 6 May 2017 20:17:19 +0800
Subject: [PATCH 034/189] fix runtime bug

---
 clguetzli/clguetzli.cl                        | 23 +++++-----
 clguetzli/clguetzli.cpp                       | 44 +++++++++----------
 .../butteraugli/butteraugli/butteraugli.cc    |  1 +
 3 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 86ffb63a..cb93294b 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -207,8 +207,8 @@ __kernel void DownSample(__global float* pA, __global float* pC, int xstep, int
 
 	const int oxsize = xsize / xstep;
 
-	const int sample_x = x / xstep * xstep;
-	const int sample_y = y / ystep * ystep;
+	const int sample_x = x / xstep;
+	const int sample_y = y / ystep;
 
 	pC[y * xsize + x] = pA[sample_y * oxsize + sample_x];
 }
@@ -443,10 +443,10 @@ __kernel void DoMask(
 
 }
 
-__kernel void ScaleImage(double scale, __global float *result)
+__kernel void ScaleImage(float scale, __global float *result)
 {
 	const int i = get_global_id(0);
-	result[i] *= (float)(scale);
+	result[i] *= scale;
 }
 
 double DotProduct(float u[3], double v[3]) {
@@ -582,8 +582,11 @@ __kernel void edgeDetectorMap(__global float *result,
 	int pos_x = res_x * step;
 	int pos_y = res_y * step;
 
-	if (res_x * step >= xsize - (8 - step)) return;
-	if (res_y * step >= ysize - (8 - step)) return;
+	if (pos_x >= xsize - (8 - step)) return;
+	if (pos_y >= ysize - (8 - step)) return;
+
+	pos_x = min(pos_x, xsize - 8);
+	pos_y = min(pos_y, ysize - 8);
 
 	int local_count = 0;
 	double local_xyb[3] = { 0 };
@@ -1330,7 +1333,7 @@ __kernel void DiffPrecompute(
 	mask_b[ix] = (float)(m);
 }
 
-void UpsampleSquareRoot(float *diffmap, size_t xsize, size_t ysize, int step, float *diffmap_out)
+__kernel void UpsampleSquareRoot(__global float *diffmap, int xsize, int ysize, int step, __global float *diffmap_out)
 {
 	const int res_x = get_global_id(0);
 	const int res_y = get_global_id(1);
@@ -1357,7 +1360,7 @@ void UpsampleSquareRoot(float *diffmap, size_t xsize, size_t ysize, int step, fl
 	}
 }
 
-void CalculateDiffmapGetBlurred(float *diffmap, int s, int s2, float *blurred)
+kernel void CalculateDiffmapGetBlurred(__global float *diffmap, int s, int s2, __global float *blurred)
 {
 	const int x = get_global_id(0);
 	const int y = get_global_id(1);
@@ -1367,7 +1370,7 @@ void CalculateDiffmapGetBlurred(float *diffmap, int s, int s2, float *blurred)
 	blurred[y * xsize + x] = diffmap[(y + s2) * xsize + s + x + s2];
 }
 
-void GetDiffmapFromBlurred(float *blurred, int s, int s2, float *diffmap)
+kernel void GetDiffmapFromBlurred(__global float *blurred, int s, int s2, __global float *diffmap)
 {
 	const int x = get_global_id(0);
 	const int y = get_global_id(1);
@@ -1379,7 +1382,7 @@ void GetDiffmapFromBlurred(float *blurred, int s, int s2, float *diffmap)
 
 }
 
-void AverageAddImage(float *img, float *tmp0, float *tmp1)
+__kernel void AverageAddImage(__global float *img, __global float *tmp0, __global float *tmp1)
 {
 	const int x = get_global_id(0);
 	const int y = get_global_id(1);
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 08d29fb7..722e56c8 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -45,15 +45,7 @@ ocl_args_d_t& getOcl(void)
         }
 	}
 	ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "MinSquareVal", &err);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clCreateKernel(MinSquareVal) for source program returned %s.\n", TranslateOpenCLError(err));
-	}
 	ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "Convolution", &err);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clCreateKernel(Convolution) for source program returned %s.\n", TranslateOpenCLError(err));
-	}
 	ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "ConvolutionX", &err);
 	ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "ConvolutionY", &err);
 	ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "DownSample", &err);
@@ -63,6 +55,7 @@ ocl_args_d_t& getOcl(void)
 	ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "CombineChannels", &err);
 	ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "MaskHighIntensityChange", &err);
 	ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "DiffPrecompute", &err);
+	ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "UpsampleSquareRoot", &err);
 	ocl.kernel[KERNEL_CALCULATEDIFFMAPGETBLURRED] = clCreateKernel(ocl.program, "CalculateDiffmapGetBlurred", &err);
 	ocl.kernel[KERNEL_GETDIFFMAPFROMBLURRED] = clCreateKernel(ocl.program, "GetDiffmapFromBlurred", &err);
 	ocl.kernel[KERNEL_AVERAGEADDIMAGE] = clCreateKernel(ocl.program, "AverageAddImage", &err);
@@ -316,12 +309,12 @@ void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize,
 	cl_int clxstep = xstep;
 	cl_int clystep = ystep;
 	cl_kernel kernel = ocl.kernel[KERNEL_DOWNSAMPLE];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcB);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem);
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&image);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&result);
 	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxstep);
 	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clystep);
 
-	size_t globalWorkSize[2] = { ysize, xsize };
+	size_t globalWorkSize[2] = { xsize, ysize };
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
@@ -367,7 +360,7 @@ void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
 
 		clConvolutionEx(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA);
 		clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, ocl.srcB);
-		clUpsampleEx(ocl.srcB, dxsize, dysize, xstep, ystep, result ? result : image);
+		clUpsampleEx(ocl.srcB, xsize, ysize, xstep, ystep, result ? result : image);
 	}
 	else
 	{
@@ -468,12 +461,12 @@ void clMaskHighIntensityChangeEx(ocl_channels xyb0_arg/*in,out*/,
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
-		LogError("Error: clScaleImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+		LogError("Error: clMaskHighIntensityChangeEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
 	}
 	err = clFinish(ocl.commandQueue);
 	if (CL_SUCCESS != err)
 	{
-		LogError("Error: clScaleImageEx() clFinish returned %s.\n", TranslateOpenCLError(err));
+		LogError("Error: clMaskHighIntensityChangeEx() clFinish returned %s.\n", TranslateOpenCLError(err));
 	}
 }
 
@@ -512,7 +505,7 @@ void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size
 	clSetKernelArg(kernel, 9, sizeof(cl_int), &clstep);
 
 	const size_t res_xsize = (xsize + step - 1) / step;
-	const size_t res_ysize = (xsize + step - 1) / step;
+	const size_t res_ysize = (ysize + step - 1) / step;
 
 	size_t globalWorkSize[2] = { res_xsize, res_ysize};
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -555,7 +548,7 @@ void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2,
 	clSetKernelArg(kernel, 10, sizeof(cl_int), &clstep);
 
 	const size_t res_xsize = (xsize + step - 1) / step;
-	const size_t res_ysize = (xsize + step - 1) / step;
+	const size_t res_ysize = (ysize + step - 1) / step;
 
 	size_t globalWorkSize[2] = { res_xsize, res_ysize };
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -606,7 +599,7 @@ void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2,
 	clSetKernelArg(kernel, 9, sizeof(cl_int), &clstep);
 
 	const size_t res_xsize = (xsize + step - 1) / step;
-	const size_t res_ysize = (xsize + step - 1) / step;
+	const size_t res_ysize = (ysize + step - 1) / step;
 
 	size_t globalWorkSize[2] = { res_xsize, res_ysize };
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -658,14 +651,13 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, float w)
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
-	cl_int clsize = size;
 	cl_float clscale = w;
 
 	cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE];
-	clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&clscale);
+	clSetKernelArg(kernel, 0, sizeof(cl_float), (void*)&clscale);
 	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img);
 
-	size_t globalWorkSize[1] = { clsize };
+	size_t globalWorkSize[1] = { size };
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
@@ -734,6 +726,8 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize)
 	{
 		LogError("Error: clAverage5x5Ex() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err));
 	}
+	err = clFinish(ocl.commandQueue);
+
 	clScaleImageEx(img, xsize * ysize, scale);
 }
 
@@ -847,7 +841,7 @@ void clCombineChannelsEx(
 	ocl_args_d_t &ocl = getOcl();
 
 	const size_t res_xsize = (xsize + step - 1) / step;
-	const size_t res_ysize = (xsize + step - 1) / step;
+	const size_t res_ysize = (ysize + step - 1) / step;
 
 	cl_int clxsize = xsize;
 	cl_int clysize = ysize;
@@ -896,10 +890,10 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step
 	clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&xsize);
 	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&ysize);
 	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&step);
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.dstMem);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&ocl.dstMem);
 
 	const size_t res_xsize = (xsize + step - 1) / step;
-	const size_t res_ysize = (xsize + step - 1) / step;
+	const size_t res_ysize = (ysize + step - 1) / step;
 
 	size_t globalWorkSize[2] = { res_xsize, res_ysize };
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -907,6 +901,7 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step
 	{
 		LogError("Error: clUpsampleSquareRootEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
 	}
+	err = clFinish(ocl.commandQueue);
 	err = clEnqueueCopyBuffer(ocl.commandQueue, ocl.dstMem, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
@@ -1030,7 +1025,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
 
 	const size_t res_xsize = (xsize + step - 1) / step;
-	const size_t res_ysize = (xsize + step - 1) / step;
+	const size_t res_ysize = (ysize + step - 1) / step;
 
 	cl_mem edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
 	cl_mem block_diff_dc = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
@@ -1049,6 +1044,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
     clCalculateDiffmapEx(mem_result, xsize, ysize, step);
 
 	cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	err = clFinish(ocl.commandQueue);
 	memcpy(result, result_r, channel_size);
 
 	ocl.releaseMemChannels(xyb0_arg);
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index cc624bed..4faa70c7 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -1126,6 +1126,7 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage(
 		result.resize(xsize_ * ysize_);
 		clDiffmapOpsinDynamicsImage(xyb0_arg[0].data(), xyb0_arg[1].data(), xyb0_arg[2].data(),
 			xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, result.data());
+		return;
 	}
 
 

From 2ceb6350ef14e9d0ce00015a64e29491e1d749ad Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 6 May 2017 20:21:34 +0800
Subject: [PATCH 035/189] remove useless code

---
 clguetzli/clguetzli.cl  | 126 --------------------------
 clguetzli/clguetzli.cpp | 196 ----------------------------------------
 clguetzli/ocl.h         |   2 -
 3 files changed, 324 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index cb93294b..6351aa7a 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -70,132 +70,6 @@ __kernel void Convolution(__global float* multipliers, __global float* inp, __gl
 
 	result[ox * ysize + y] = sum * scale;
 }
-/*
-__kernel void ConvolutionX(__global float* multipliers, __global float* inp, __global float* result,
-	int len, int offset, float border_ratio)
-{
-	const int x = get_global_id(0);
-	const int y = get_global_id(1);
-
-	const int xsize = get_global_size(0);
-	const int ysize = get_global_size(1);
-
-	float weight_no_border = 0;
-	for (int j = 0; j <= 2 * offset; j++)
-	{
-		weight_no_border += multipliers[j];
-	}
-
-	int minx = x < offset ? 0 : x - offset;
-	int maxx = min(xsize, x + len - offset);
-
-	int miny = y < offset ? 0 : y - offset;
-	int maxy = min(ysize, y + len - offset);
-
-	float weightX = 0.0;
-	for (int j = minx; j < maxx; j++)
-	{
-		weightX += multipliers[j - x + offset];
-	}
-
-	weightX = (1.0 - border_ratio) * weightX + border_ratio * weight_no_border;
-
-	float weightY = 0.0;
-	for (int j = miny; j < maxy; j++)
-	{
-		weightY += multipliers[j - y + offset];
-	}
-
-	weightY = (1.0 - border_ratio) * weightY + border_ratio * weight_no_border;
-
-
-	float sum = 0.0;
-	for (int j = miny; j < maxy; j++)
-	{
-		float sumx = 0.0;
-		for (int i = minx; i < maxx; i++)
-		{
-			sumx += inp[j * xsize + i] * multipliers[i - x + offset];
-		}
-
-		sum += sumx * multipliers[j - y + offset];
-	}
-
-	result[y * xsize + x] = sum / weightY / weightX;
-}
-*/
-
-__kernel void ConvolutionX(__global float* multipliers, __global float* inp, __global float* result,
-	int len, int offset, float border_ratio)
-{
-	const int x = get_global_id(0);
-	const int y = get_global_id(1);
-
-	const int xsize = get_global_size(0);
-	const int ysize = get_global_size(1);
-
-	float weight_no_border = 0;
-	for (int j = 0; j <= 2 * offset; j++)
-	{
-		weight_no_border += multipliers[j];
-	}
-
-	int minx = x < offset ? 0 : x - offset;
-	int maxx = min(xsize, x + len - offset);
-
-	float weight = 0.0;
-	for (int j = minx; j < maxx; j++)
-	{
-		weight += multipliers[j - x + offset];
-	}
-
-	weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
-	float scale = 1.0 / weight;
-
-	float sum = 0.0;
-	for (int j = minx; j < maxx; j++)
-	{
-		sum += inp[y * xsize + j] * multipliers[j - x + offset];
-	}
-
-	result[x * ysize + y] = sum * scale;
-}
-
-__kernel void ConvolutionY(__global float* multipliers, __global float* inp, __global float* result,
-	int len, int offset, float border_ratio)
-{
-	const int x = get_global_id(0);
-	const int y = get_global_id(1);
-
-	const int xsize = get_global_size(0);
-	const int ysize = get_global_size(1);
-
-	float weight_no_border = 0;
-	for (int j = 0; j <= 2 * offset; j++)
-	{
-		weight_no_border += multipliers[j];
-	}
-
-	int miny = y < offset ? 0 : y - offset;
-	int maxy = min(ysize, y + len - offset);
-
-	float weight = 0.0;
-	for (int j = miny; j < maxy; j++)
-	{
-		weight += multipliers[j - y + offset];
-	}
-
-	weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
-	float scale = 1.0 / weight;
-
-	float sum = 0.0;
-	for (int j = miny; j < maxy; j++)
-	{
-		sum += inp[j * xsize + x] * multipliers[j - y + offset];
-	}
-
-	result[y * xsize + x] = sum * scale;
-}
 
 __kernel void DownSample(__global float* pA, __global float* pC, int xstep, int ystep)
 {
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 722e56c8..989c5e2f 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -46,8 +46,6 @@ ocl_args_d_t& getOcl(void)
 	}
 	ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "MinSquareVal", &err);
 	ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "Convolution", &err);
-	ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "ConvolutionX", &err);
-	ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "ConvolutionY", &err);
 	ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "DownSample", &err);
 	ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "OpsinDynamicsImage", &err);
 	ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "DoMask", &err);
@@ -66,200 +64,6 @@ ocl_args_d_t& getOcl(void)
 	return ocl;
 }
 
-void clMinSquareVal(size_t square_size, size_t offset,
-	size_t xsize, size_t ysize,
-	float *values)
-{
-	cl_int err = CL_SUCCESS;
-	ocl_args_d_t &ocl = getOcl();
-
-	ocl.allocA(sizeof(cl_float) * xsize * ysize);
-	ocl.allocC(sizeof(cl_float) * xsize * ysize);
-
-	memcpy(ocl.inputA, values, sizeof(cl_float) * xsize * ysize);
-
-	cl_int cloffset = offset;
-	cl_int clsquare_size = square_size;
-
-	cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem);
-	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clsquare_size);
-	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cloffset);
-
-	size_t globalWorkSize[2] = { xsize, ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
-	}
-	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
-	}
-
-	cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, true, CL_MAP_READ, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL, &err);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clEnqueueMapBuffer returned %s\n", TranslateOpenCLError(err));
-	}
-	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clFinish returned %s\n", TranslateOpenCLError(err));
-	}
-
-	memcpy(values, resultPtr, sizeof(cl_float) * xsize * ysize);
-}
-
-void clConvolution(size_t xsize, size_t ysize,
-	size_t xstep,
-	size_t len, size_t offset,
-	const float* multipliers,
-	const float* inp,
-	float border_ratio,
-	float* result)
-{
-	cl_int err = CL_SUCCESS;
-	ocl_args_d_t &ocl = getOcl();
-
-	size_t oxsize = xsize / xstep;
-
-	ocl.allocA(sizeof(cl_float) * len);
-	ocl.allocB(sizeof(cl_float) * xsize * ysize);
-	ocl.allocC(sizeof(cl_float) * oxsize * ysize);
-
-	memcpy(ocl.inputA, multipliers, sizeof(cl_float) * len);
-	memcpy(ocl.inputB, inp, sizeof(cl_float) * xsize * ysize);
-
-	cl_int clxsize = xsize;
-	cl_int clxstep = xstep;
-	cl_int cllen = len;
-	cl_int cloffset = offset;
-	cl_float clborder_ratio = border_ratio;
-
-	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.srcB);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&ocl.dstMem);
-	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize);
-	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep);
-	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cllen);
-	clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset);
-	clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio);
-
-	size_t globalWorkSize[2] = { xsize / xstep, ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
-	}
-	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
-	}
-
-	cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, true, CL_MAP_READ, 0, sizeof(cl_float) * oxsize * ysize, 0, NULL, NULL, &err);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clEnqueueMapBuffer returned %s\n", TranslateOpenCLError(err));
-	}
-	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clFinish returned %s\n", TranslateOpenCLError(err));
-	}
-
-	memcpy(result, resultPtr, sizeof(cl_float) * oxsize * ysize);
-}
-
-void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio)
-{
-	double m = 2.25;  // Accuracy increases when m is increased.
-	const double scaler = -1.0 / (2 * sigma * sigma);
-	// For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52}
-	const int diff = std::max<int>(1, m * fabs(sigma));
-	const int expn_size = 2 * diff + 1;
-	std::vector<float> expn(expn_size);
-	for (int i = -diff; i <= diff; ++i) {
-		expn[i + diff] = static_cast<float>(exp(scaler * i * i));
-	}
-
-	const int xstep = std::max<int>(1, int(sigma / 3));
-
-	cl_int err = CL_SUCCESS;
-	ocl_args_d_t &ocl = getOcl();
-
-	ocl.allocA(sizeof(cl_float) * expn_size);
-	ocl.allocB(sizeof(cl_float) * xsize * ysize);
-	ocl.allocC(sizeof(cl_float) * xsize * ysize);
-
-	memcpy(ocl.inputA, expn.data(), sizeof(cl_float) * expn_size);
-	memcpy(ocl.inputB, channel, sizeof(cl_float) * xsize * ysize);
-
-	cl_int clxsize = xsize;
-	cl_int clxstep = xstep;
-	cl_int cllen = expn_size;
-	cl_int cloffset = diff;
-	cl_float clborder_ratio = border_ratio;
-
-	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.srcB);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&ocl.dstMem);
-	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize);
-	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep);
-	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cllen);
-	clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset);
-	clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio);
-
-	size_t globalWorkSize[2] = { xsize / xstep, ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
-
-	globalWorkSize[0] = ysize / xstep;
-	globalWorkSize[1] = xsize / xstep;
-	clxsize = ysize;
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&ocl.srcB);
-	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize);
-	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep);
-	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cllen);
-	clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset);
-	clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio);
-
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
-
-	cl_int clstep = xstep;
-	if (clstep <= 1)
-	{
-		cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.srcB, true, CL_MAP_READ, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL, &err);
-		err = clFinish(ocl.commandQueue);
-		memcpy(channel, resultPtr, sizeof(cl_float) * xsize * ysize);
-	}
-	else
-	{
-		kernel = ocl.kernel[KERNEL_DOWNSAMPLE];
-		clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcB);
-		clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.dstMem);
-		clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clstep);
-		clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clstep);
-
-		globalWorkSize[0] = ysize;
-		globalWorkSize[1] = xsize;
-		err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-		err = clFinish(ocl.commandQueue);
-
-		cl_float *resultPtr = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, ocl.dstMem, true, CL_MAP_READ, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL, &err);
-		err = clFinish(ocl.commandQueue);
-		memcpy(channel, resultPtr, sizeof(cl_float) * xsize * ysize);
-	}
-}
-
 void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize,
 				     cl_mem multipliers, size_t len,
                      int xstep, int offset, double border_ratio,
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index ecd3af86..7babc74e 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -46,8 +46,6 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType);
 enum KernelName {
 	KERNEL_MINSQUAREVAL = 0,
 	KERNEL_CONVOLUTION,
-	KERNEL_CONVOLUTIONX,
-	KERNEL_CONVOLUTIONY,
 	KERNEL_DOWNSAMPLE,
 	KERNEL_OPSINDYNAMICSIMAGE,
 	KERNEL_DOMASK,

From 6981d9f5014eeee871c633ab14cfa98c29e3656c Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 6 May 2017 22:29:57 +0800
Subject: [PATCH 036/189] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=B5=8B=E8=AF=95?=
 =?UTF-8?q?=E7=94=A8=E4=BE=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cpp                       |  59 ++++----
 clguetzli/clguetzli.h                         |  22 ++-
 clguetzli/clguetzli_test.cpp                  | 135 ++++++++++++++++++
 clguetzli/clguetzli_test.h                    |  16 +++
 clguetzli/utils.h                             |   5 -
 guetzli.vcxproj                               |   2 +
 guetzli.vcxproj.filters                       |   6 +
 .../butteraugli/butteraugli/butteraugli.cc    |  13 ++
 8 files changed, 211 insertions(+), 47 deletions(-)
 create mode 100644 clguetzli/clguetzli_test.cpp
 create mode 100644 clguetzli/clguetzli_test.h

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 989c5e2f..9b1ae457 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -2,7 +2,6 @@
 #include <algorithm>
 #include <vector>
 #include "clguetzli.h"
-#include "ocl.h"
 
 extern bool g_useOpenCL = false;
 
@@ -238,19 +237,30 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float*
 	ocl.releaseMemChannels(rgb_blurred);
 }
 
-void clMaskHighIntensityChangeEx(ocl_channels xyb0_arg/*in,out*/,
-                                 ocl_channels xyb1/*in,out*/,
-								 ocl_channels c0,
-								 ocl_channels c1,
+void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/,
+								 ocl_channels xyb1/*in,out*/,
                                  size_t xsize, size_t ysize)
 {
+	cl_int channel_size = xsize * ysize * sizeof(float);
+
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
+	ocl_channels c0 = ocl.allocMemChannels(channel_size);
+	ocl_channels c1 = ocl.allocMemChannels(channel_size);
+
+	clEnqueueCopyBuffer(ocl.commandQueue, xyb0.r, c0.r, 0, 0, channel_size, 0, NULL, NULL);
+	clEnqueueCopyBuffer(ocl.commandQueue, xyb0.g, c0.g, 0, 0, channel_size, 0, NULL, NULL);
+	clEnqueueCopyBuffer(ocl.commandQueue, xyb0.b, c0.b, 0, 0, channel_size, 0, NULL, NULL);
+	clEnqueueCopyBuffer(ocl.commandQueue, xyb1.r, c1.r, 0, 0, channel_size, 0, NULL, NULL);
+	clEnqueueCopyBuffer(ocl.commandQueue, xyb1.g, c1.g, 0, 0, channel_size, 0, NULL, NULL);
+	clEnqueueCopyBuffer(ocl.commandQueue, xyb1.b, c1.b, 0, 0, channel_size, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+
 	cl_kernel kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0_arg.r);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0_arg.g);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0_arg.b);
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.r);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0.g);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0.b);
 	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb1.r);
 	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb1.g);
 	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb1.b);
@@ -272,6 +282,9 @@ void clMaskHighIntensityChangeEx(ocl_channels xyb0_arg/*in,out*/,
 	{
 		LogError("Error: clMaskHighIntensityChangeEx() clFinish returned %s.\n", TranslateOpenCLError(err));
 	}
+
+	ocl.releaseMemChannels(c0);
+	ocl.releaseMemChannels(c1);
 }
 
 void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, size_t step, cl_mem result/*out*/)
@@ -802,26 +815,16 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	ocl_channels xyb0_arg = ocl.allocMemChannels(channel_size);
-	ocl_channels xyb1 = ocl.allocMemChannels(channel_size);
-
 	ocl_channels xyb0 = ocl.allocMemChannels(channel_size);
-	ocl_channels xyb1_c = ocl.allocMemChannels(channel_size);
+	ocl_channels xyb1 = ocl.allocMemChannels(channel_size);
 
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0_arg.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0_arg.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0_arg.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
 
-
-	err = clEnqueueCopyBuffer(ocl.commandQueue, xyb0_arg.r, xyb0.r, 0, 0, channel_size, 0, NULL, NULL);
-	err = clEnqueueCopyBuffer(ocl.commandQueue, xyb0_arg.g, xyb0.g, 0, 0, channel_size, 0, NULL, NULL);
-	err = clEnqueueCopyBuffer(ocl.commandQueue, xyb0_arg.b, xyb0.b, 0, 0, channel_size, 0, NULL, NULL);
-	err = clEnqueueCopyBuffer(ocl.commandQueue, xyb1.r, xyb1_c.r, 0, 0, channel_size, 0, NULL, NULL);
-	err = clEnqueueCopyBuffer(ocl.commandQueue, xyb1.g, xyb1_c.g, 0, 0, channel_size, 0, NULL, NULL);
-	err = clEnqueueCopyBuffer(ocl.commandQueue, xyb1.b, xyb1_c.b, 0, 0, channel_size, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
 	cl_mem mem_result = ocl.allocMem(channel_size);
@@ -835,13 +838,13 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	cl_mem block_diff_dc = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
 	cl_mem block_diff_ac = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
 
-	clMaskHighIntensityChangeEx(xyb0_arg, xyb1_c, xyb0, xyb1, xsize, ysize);
+	clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
 
-	clEdgeDetectorMapEx(xyb0_arg, xyb1, xsize, ysize, step, edge_detector_map);
-	clBlockDiffMapEx(xyb0_arg, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac);
-	clEdgeDetectorLowFreqEx(xyb0_arg, xyb1, xsize, ysize, step, block_diff_ac);
+	clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge_detector_map);
+	clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac);
+	clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac);
 
-	clMaskEx(xyb0_arg, xyb1, xsize, ysize, mask, mask_dc);
+	clMaskEx(xyb0, xyb1, xsize, ysize, mask, mask_dc);
 
 	clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, step, mem_result);
 
@@ -851,10 +854,8 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	err = clFinish(ocl.commandQueue);
 	memcpy(result, result_r, channel_size);
 
-	ocl.releaseMemChannels(xyb0_arg);
 	ocl.releaseMemChannels(xyb1);
 	ocl.releaseMemChannels(xyb0);
-	ocl.releaseMemChannels(xyb1_c);
 
 	clReleaseMemObject(edge_detector_map);
 	clReleaseMemObject(block_diff_dc);
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 6f29dd35..91dd25ac 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -1,22 +1,18 @@
 #pragma once
 #include "CL\cl.h"
+#include "ocl.h"
+
 extern bool g_useOpenCL;
 
-void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio, cl_mem result = nullptr);
+void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/,
+	ocl_channels xyb1/*in,out*/,
+	size_t xsize, size_t ysize);
 
-void clMinSquareVal(size_t square_size, size_t offset,
+void clMaskEx(ocl_channels rgb, ocl_channels rgb2,
 	size_t xsize, size_t ysize,
-	float *values);
-
-void clConvolution(size_t xsize, size_t ysize,
-	size_t xstep,
-	size_t len, size_t offset,
-	const float* multiplier,
-	const float* inp,
-	float border_ratio,
-	float* result);
+	ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/);
 
-void clBlur(size_t xsize, size_t ysize, float* channel, double sigma, double border_ratio);
+void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio, cl_mem result = nullptr);
 
 void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b);
 
@@ -24,4 +20,4 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	float* r2, float* g2, float* b2,
 	size_t xsize, size_t ysize,
 	size_t step,
-	float* result);
\ No newline at end of file
+	float* result);
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
new file mode 100644
index 00000000..2ff85802
--- /dev/null
+++ b/clguetzli/clguetzli_test.cpp
@@ -0,0 +1,135 @@
+#include <CL/cl.h>
+#include <math.h>
+#include <assert.h>
+#include "clguetzli_test.h"
+#include "clguetzli.h"
+#include "ocl.h"
+
+bool floatCompare(const float* a, const float* b, size_t size)
+{
+	for (int i = 0; i < size; i++)
+	{
+		if (fabs(a[i] - b[i]) > 0.001)
+		{
+			return false;
+		}
+	}
+	return true;
+}
+
+void clMaskHighIntensityChange(const float* r, const float* g, const float* b,
+	const float* r2, const float* g2, const float* b2,
+	size_t xsize, size_t ysize,
+	const float* result_r, const float* result_g, const float* result_b,
+	const float* result_r2, const float* result_g2, const float* result_b2)
+{
+	size_t channel_size = xsize * ysize * sizeof(float);
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+	ocl_channels xyb0 = ocl.allocMemChannels(channel_size);
+	ocl_channels xyb1 = ocl.allocMemChannels(channel_size);
+
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
+
+	err = clFinish(ocl.commandQueue);
+
+	clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
+
+	cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb0.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb0.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *r0_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb0.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+
+	floatCompare(result_r, r0_r, xsize * ysize);
+	floatCompare(result_g, r0_g, xsize * ysize);
+	floatCompare(result_b, r0_b, xsize * ysize);
+	floatCompare(result_r2, r1_r, xsize * ysize);
+	floatCompare(result_g2, r1_g, xsize * ysize);
+	floatCompare(result_b2, r1_b, xsize * ysize);
+
+	ocl.releaseMemChannels(xyb0);
+	ocl.releaseMemChannels(xyb1);
+}
+
+// strong to
+void clEdgeDetectorMap(void)
+{
+
+}
+
+// strong todo
+void clBlockDiffMap(void)
+{
+
+}
+
+// strong to
+void clEdgeDetectorLowFreq(void)
+{
+
+}
+
+void clMask(const float* r, const float* g, const float* b,
+	const float* r2, const float* g2, const float* b2,
+	size_t xsize, size_t ysize,
+	const float* mask_r, const float* mask_g, const float* mask_b,
+	const float* maskdc_r, const float* maskdc_g, const float* maskdc_b)
+{
+	size_t channel_size = xsize * ysize * sizeof(float);
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+	ocl_channels rgb = ocl.allocMemChannels(channel_size);
+	ocl_channels rgb2 = ocl.allocMemChannels(channel_size);
+
+	ocl_channels mask = ocl.allocMemChannels(channel_size);
+	ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
+
+	clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, rgb2.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, rgb2.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, rgb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
+
+	err = clFinish(ocl.commandQueue);
+
+	clMaskEx(rgb, rgb2, xsize, ysize, mask/*out*/, mask_dc/*out*/);
+
+	cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *r0_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+
+	floatCompare(mask_r, r0_r, xsize * ysize);
+	floatCompare(mask_g, r0_g, xsize * ysize);
+	floatCompare(mask_b, r0_b, xsize * ysize);
+	floatCompare(maskdc_r, r1_r, xsize * ysize);
+	floatCompare(maskdc_g, r1_g, xsize * ysize);
+	floatCompare(maskdc_b, r1_b, xsize * ysize);
+
+	ocl.releaseMemChannels(rgb);
+	ocl.releaseMemChannels(rgb2);
+	ocl.releaseMemChannels(mask);
+	ocl.releaseMemChannels(mask_dc);
+}
+
+// ian todo
+void clCombineChannels(void)
+{
+
+}
+
+// ian todo
+void clCalculateDiffmapEx(void)
+{
+
+}
diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h
new file mode 100644
index 00000000..6d3f58c2
--- /dev/null
+++ b/clguetzli/clguetzli_test.h
@@ -0,0 +1,16 @@
+#pragma once
+#include "ocl.h"
+
+ocl_args_d_t& getOcl(void);
+
+void clMaskHighIntensityChange(const float* r, const float* g, const float* b,
+	const float* r2, const float* g2, const float* b2,
+	size_t xsize, size_t ysize,
+	const float* result_r, const float* result_g, const float* result_b,
+	const float* result_r2, const float* result_g2, const float* result_b2);
+
+void clMask(const float* r, const float* g, const float* b,
+	const float* r2, const float* g2, const float* b2,
+	size_t xsize, size_t ysize,
+	const float* mask_r, const float* mask_g, const float* mask_b,
+	const float* maskdc_r, const float* maskdc_g, const float* maskdc_b);
diff --git a/clguetzli/utils.h b/clguetzli/utils.h
index 294f7137..fc68fec5 100644
--- a/clguetzli/utils.h
+++ b/clguetzli/utils.h
@@ -19,11 +19,6 @@
  * Intel Corporation is the author of the Materials, and requests that all
  * problem reports or change requests be submitted to it directly
  *****************************************************************************/
-
-#include "CL\cl.h"
-#include <d3d9.h>
-
-
 #pragma once
 
 // Print useful information to the default output. Same usage as with printf
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 3aa98abf..e48d1682 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -192,6 +192,7 @@
   </ItemDefinitionGroup>
   <ItemGroup>
     <ClInclude Include="clguetzli\clguetzli.h" />
+    <ClInclude Include="clguetzli\clguetzli_test.h" />
     <ClInclude Include="clguetzli\ocl.h" />
     <ClInclude Include="clguetzli\utils.h" />
     <ClInclude Include="guetzli\butteraugli_comparator.h" />
@@ -286,6 +287,7 @@
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="clguetzli\clguetzli.cpp" />
+    <ClCompile Include="clguetzli\clguetzli_test.cpp" />
     <ClCompile Include="clguetzli\ocl.cpp" />
     <ClCompile Include="clguetzli\utils.cpp" />
     <ClCompile Include="guetzli\butteraugli_comparator.cc" />
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index 308cad47..a74b94c9 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -303,6 +303,9 @@
     <ClInclude Include="clguetzli\clguetzli.h">
       <Filter>clguetzli</Filter>
     </ClInclude>
+    <ClInclude Include="clguetzli\clguetzli_test.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc">
@@ -563,6 +566,9 @@
     <ClCompile Include="clguetzli\clguetzli.cpp">
       <Filter>clguetzli</Filter>
     </ClCompile>
+    <ClCompile Include="clguetzli\clguetzli_test.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <None Include="third_party\libpng\pngwin.def">
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 4faa70c7..4ac771dd 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -41,6 +41,7 @@
 #include <array>
 
 #include "clguetzli\clguetzli.h"
+#include "clguetzli\clguetzli_test.h"
 
 // Restricted pointers speed up Convolution(); MSVC uses a different keyword.
 #ifdef _MSC_VER
@@ -828,6 +829,12 @@ void MaskHighIntensityChange(
       }
     }
   }
+
+  clMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(),  // input channels, first image
+	  c1[0].data(), c1[1].data(), c1[2].data(),  // input channels, second image
+	  xsize, ysize,
+	  xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),  // expected outputs, first image
+	  xyb1[0].data(), xyb1[1].data(), xyb1[2].data());  // expected outputs, second image (was xyb0[0]: copy-paste slip)
 }
 
 double SimpleGamma(double v) {
@@ -1609,6 +1616,12 @@ void Mask(const std::vector<std::vector<float> > &xyb0,
     ScaleImage(kGlobalScale * kGlobalScale, &(*mask)[i]);
     ScaleImage(kGlobalScale * kGlobalScale, &(*mask_dc)[i]);
   }
+
+  clMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+	  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+	  xsize, ysize,
+	  (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(),
+	  (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data());
 }
 
 }  // namespace butteraugli

From 7e1ad82e99777e372fe0792efd7236852916e820 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sun, 7 May 2017 13:25:32 +0800
Subject: [PATCH 037/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=B5=8B=E8=AF=95?=
 =?UTF-8?q?=E7=94=A8=E4=BE=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.h                         |  10 ++
 clguetzli/clguetzli_test.cpp                  | 118 ++++++++++++++++--
 clguetzli/clguetzli_test.h                    |  15 +++
 .../butteraugli/butteraugli/butteraugli.cc    |  14 +++
 4 files changed, 149 insertions(+), 8 deletions(-)

diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 91dd25ac..a714bf44 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -12,6 +12,16 @@ void clMaskEx(ocl_channels rgb, ocl_channels rgb2,
 	size_t xsize, size_t ysize,
 	ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/);
 
+void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, size_t step, cl_mem result/*out*/);
+
+void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2,
+	size_t xsize, size_t ysize, size_t step,
+	cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/);
+
+void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2,
+	size_t xsize, size_t ysize, size_t step,
+	cl_mem block_diff_ac/*out*/);
+
 void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio, cl_mem result = nullptr);
 
 void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b);
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 2ff85802..eeabe8a8 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -5,16 +5,17 @@
 #include "clguetzli.h"
 #include "ocl.h"
 
-bool floatCompare(const float* a, const float* b, size_t size)
+int floatCompare(const float* a, const float* b, size_t size)
 {
+	int count = 0;
 	for (int i = 0; i < size; i++)
 	{
 		if (fabs(a[i] - b[i]) > 0.001)
 		{
-			return false;
+			count++;
 		}
 	}
-	return true;
+	return count;
 }
 
 void clMaskHighIntensityChange(const float* r, const float* g, const float* b,
@@ -35,7 +36,6 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
-
 	err = clFinish(ocl.commandQueue);
 
 	clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
@@ -46,6 +46,7 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
 	cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
 	cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	err = clFinish(ocl.commandQueue);
 
 	floatCompare(result_r, r0_r, xsize * ysize);
 	floatCompare(result_g, r0_g, xsize * ysize);
@@ -59,21 +60,122 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b,
 }
 
 // strong to
-void clEdgeDetectorMap(void)
+void clEdgeDetectorMap(const float* r, const float* g, const float* b,
+	const float* r2, const float* g2, const float* b2,
+	size_t xsize, size_t ysize, size_t step,
+	const float* result)
 {
+	size_t channel_size = xsize * ysize * sizeof(float);
+	const size_t res_xsize = (xsize + step - 1) / step;
+	const size_t res_ysize = (ysize + step - 1) / step;
+	const size_t edgemap_size = res_xsize * res_ysize * 3 * sizeof(float);
+
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+	ocl_channels xyb0 = ocl.allocMemChannels(channel_size);
+	ocl_channels xyb1 = ocl.allocMemChannels(channel_size);
+	cl_mem edge = ocl.allocMem(edgemap_size);
+
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+
+	clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge);
 
+	cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, edge, true, CL_MAP_READ, 0, edgemap_size, 0, NULL, NULL, &err);
+	err = clFinish(ocl.commandQueue);
+
+	floatCompare(result, r_r, res_xsize * res_ysize * 3);
+
+	ocl.releaseMemChannels(xyb0);
+	ocl.releaseMemChannels(xyb1);
+	clReleaseMemObject(edge);
 }
 
 // strong todo
-void clBlockDiffMap(void)
+void clBlockDiffMap(const float* r, const float* g, const float* b,
+	const float* r2, const float* g2, const float* b2,
+	size_t xsize, size_t ysize, size_t step,
+	const float* result_diff_dc, const float* result_diff_ac)
 {
+	size_t channel_size = xsize * ysize * sizeof(float);
+	const size_t res_xsize = (xsize + step - 1) / step;
+	const size_t res_ysize = (ysize + step - 1) / step;
+	const size_t reschannel_size = res_xsize * res_ysize * 3 * sizeof(float);
+
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+	ocl_channels xyb0 = ocl.allocMemChannels(channel_size);
+	ocl_channels xyb1 = ocl.allocMemChannels(channel_size);
 
+	cl_mem block_diff_dc = ocl.allocMem(reschannel_size);
+	cl_mem block_diff_ac = ocl.allocMem(reschannel_size);
+
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+
+	clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac);
+
+	cl_float *r_dc = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_dc, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err);
+	cl_float *r_ac = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_ac, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err);
+	err = clFinish(ocl.commandQueue);
+
+	floatCompare(r_dc, result_diff_dc, res_xsize * res_ysize * 3);
+	floatCompare(r_ac, result_diff_ac, res_xsize * res_ysize * 3);
+
+	ocl.releaseMemChannels(xyb0);
+	ocl.releaseMemChannels(xyb1);
+
+	clReleaseMemObject(block_diff_ac);
+	clReleaseMemObject(block_diff_dc);
 }
 
 // strong to
-void clEdgeDetectorLowFreq(void)
+void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
+	const float* r2, const float* g2, const float* b2,
+	size_t xsize, size_t ysize, size_t step,
+	const float* result_diff_dc)
 {
+	size_t channel_size = xsize * ysize * sizeof(float);
+	const size_t res_xsize = (xsize + step - 1) / step;
+	const size_t res_ysize = (ysize + step - 1) / step;
+	const size_t reschannel_size = res_xsize * res_ysize * 3 * sizeof(float);
+
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+	ocl_channels xyb0 = ocl.allocMemChannels(channel_size);
+	ocl_channels xyb1 = ocl.allocMemChannels(channel_size);
+
+	cl_mem block_diff_dc = ocl.allocMem(reschannel_size);
 
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+
+	clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc);
+
+	cl_float *r_dc = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_dc, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err);
+	err = clFinish(ocl.commandQueue);
+
+	floatCompare(r_dc, result_diff_dc, res_xsize * res_ysize * 3);
+
+	ocl.releaseMemChannels(xyb0);
+	ocl.releaseMemChannels(xyb1);
+
+	clReleaseMemObject(block_diff_dc);
 }
 
 void clMask(const float* r, const float* g, const float* b,
@@ -97,7 +199,6 @@ void clMask(const float* r, const float* g, const float* b,
 	clEnqueueWriteBuffer(ocl.commandQueue, rgb2.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, rgb2.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, rgb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
-
 	err = clFinish(ocl.commandQueue);
 
 	clMaskEx(rgb, rgb2, xsize, ysize, mask/*out*/, mask_dc/*out*/);
@@ -108,6 +209,7 @@ void clMask(const float* r, const float* g, const float* b,
 	cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
 	cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
 	cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	err = clFinish(ocl.commandQueue);
 
 	floatCompare(mask_r, r0_r, xsize * ysize);
 	floatCompare(mask_g, r0_g, xsize * ysize);
diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h
index 6d3f58c2..3b62144e 100644
--- a/clguetzli/clguetzli_test.h
+++ b/clguetzli/clguetzli_test.h
@@ -9,6 +9,21 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	const float* result_r, const float* result_g, const float* result_b,
 	const float* result_r2, const float* result_g2, const float* result_b2);
 
+void clEdgeDetectorMap(const float* r, const float* g, const float* b,
+	const float* r2, const float* g2, const float* b2,
+	size_t xsize, size_t ysize, size_t step,
+	const float* result);
+
+void clBlockDiffMap(const float* r, const float* g, const float* b,
+	const float* r2, const float* g2, const float* b2,
+	size_t xsize, size_t ysize, size_t step,
+	const float* result_diff_dc, const float* result_diff_ac);
+
+void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
+	const float* r2, const float* g2, const float* b2,
+	size_t xsize, size_t ysize, size_t step,
+	const float* result_diff_dc);
+
 void clMask(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize,
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 4ac771dd..138234ad 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -1200,6 +1200,10 @@ void ButteraugliComparator::BlockDiffMap(
       }
     }
   }
+  clBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+	  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+	  xsize_, ysize_, step_,
+	  (*block_diff_dc).data(), (*block_diff_ac).data());
 }
 
 void ButteraugliComparator::EdgeDetectorMap(
@@ -1232,6 +1236,11 @@ void ButteraugliComparator::EdgeDetectorMap(
       }
     }
   }
+
+  clEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+	  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+	  xsize_, ysize_, step_,
+	  (*edge_detector_map).data());
 }
 
 void ButteraugliComparator::EdgeDetectorLowFreq(
@@ -1288,6 +1297,11 @@ void ButteraugliComparator::EdgeDetectorLowFreq(
       }
     }
   }
+
+  clEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+	  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+	  xsize_, ysize_, step_,
+	  (*block_diff_ac).data());
 }
 
 void ButteraugliComparator::CombineChannels(

From 7ef1b6dccad8fa0c68ae0e18abcf1e5565594ae8 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sun, 7 May 2017 14:05:44 +0800
Subject: [PATCH 038/189] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=B5=8B=E8=AF=95?=
 =?UTF-8?q?=E7=94=A8=E4=BE=8B=E6=A1=86=E6=9E=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli_test.cpp | 108 +++++++++++++++++++++++++++++------
 1 file changed, 91 insertions(+), 17 deletions(-)

diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index eeabe8a8..6718a17e 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -5,7 +5,9 @@
 #include "clguetzli.h"
 #include "ocl.h"
 
-int floatCompare(const float* a, const float* b, size_t size)
+#define FLOAT_COMPARE(a, b, c)  floatCompare((a), (b), (c), __FUNCTION__, __LINE__ )
+
+int floatCompare(const float* a, const float* b, size_t size, const char* szFunc, int line)
 {
 	int count = 0;
 	for (int i = 0; i < size; i++)
@@ -15,6 +17,10 @@ int floatCompare(const float* a, const float* b, size_t size)
 			count++;
 		}
 	}
+	if (count > 0)
+	{
+		LogError("CHK %s(%d) %d:%d\r\n", szFunc, line, count, (int)size);  // size_t narrowed to match the %d conversion
+	}
 	return count;
 }
 
@@ -48,12 +54,20 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, xyb1.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
 	err = clFinish(ocl.commandQueue);
 
-	floatCompare(result_r, r0_r, xsize * ysize);
-	floatCompare(result_g, r0_g, xsize * ysize);
-	floatCompare(result_b, r0_b, xsize * ysize);
-	floatCompare(result_r2, r1_r, xsize * ysize);
-	floatCompare(result_g2, r1_g, xsize * ysize);
-	floatCompare(result_b2, r1_b, xsize * ysize);
+	FLOAT_COMPARE(result_r, r0_r, xsize * ysize);
+	FLOAT_COMPARE(result_g, r0_g, xsize * ysize);
+	FLOAT_COMPARE(result_b, r0_b, xsize * ysize);
+	FLOAT_COMPARE(result_r2, r1_r, xsize * ysize);
+	FLOAT_COMPARE(result_g2, r1_g, xsize * ysize);
+	FLOAT_COMPARE(result_b2, r1_b, xsize * ysize);
+
+	clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.r, r0_r, 0, NULL, NULL);  // 4th arg is the wait-list event count, not a byte size
+	clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.g, r0_g, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.b, r0_b, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.r, r1_r, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.g, r1_g, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.b, r1_b, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
 
 	ocl.releaseMemChannels(xyb0);
 	ocl.releaseMemChannels(xyb1);
@@ -89,7 +103,10 @@ void clEdgeDetectorMap(const float* r, const float* g, const float* b,
 	cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, edge, true, CL_MAP_READ, 0, edgemap_size, 0, NULL, NULL, &err);
 	err = clFinish(ocl.commandQueue);
 
-	floatCompare(result, r_r, res_xsize * res_ysize * 3);
+	FLOAT_COMPARE(result, r_r, res_xsize * res_ysize * 3);
+
+	clEnqueueUnmapMemObject(ocl.commandQueue, edge, r_r, 0, NULL, NULL);  // 4th arg is the wait-list event count, not a byte size
+	err = clFinish(ocl.commandQueue);
 
 	ocl.releaseMemChannels(xyb0);
 	ocl.releaseMemChannels(xyb1);
@@ -129,8 +146,12 @@ void clBlockDiffMap(const float* r, const float* g, const float* b,
 	cl_float *r_ac = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_ac, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err);
 	err = clFinish(ocl.commandQueue);
 
-	floatCompare(r_dc, result_diff_dc, res_xsize * res_ysize * 3);
-	floatCompare(r_ac, result_diff_ac, res_xsize * res_ysize * 3);
+	FLOAT_COMPARE(r_dc, result_diff_dc, res_xsize * res_ysize * 3);
+	FLOAT_COMPARE(r_ac, result_diff_ac, res_xsize * res_ysize * 3);
+
+	clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_dc, r_dc, 0, NULL, NULL);  // 4th arg is the wait-list event count, not a byte size
+	clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
 
 	ocl.releaseMemChannels(xyb0);
 	ocl.releaseMemChannels(xyb1);
@@ -170,7 +191,10 @@ void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
 	cl_float *r_dc = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_dc, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err);
 	err = clFinish(ocl.commandQueue);
 
-	floatCompare(r_dc, result_diff_dc, res_xsize * res_ysize * 3);
+	FLOAT_COMPARE(r_dc, result_diff_dc, res_xsize * res_ysize * 3);
+
+	clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_dc, r_dc, 0, NULL, NULL);  // 4th arg is the wait-list event count, not a byte size
+	err = clFinish(ocl.commandQueue);
 
 	ocl.releaseMemChannels(xyb0);
 	ocl.releaseMemChannels(xyb1);
@@ -211,12 +235,20 @@ void clMask(const float* r, const float* g, const float* b,
 	cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
 	err = clFinish(ocl.commandQueue);
 
-	floatCompare(mask_r, r0_r, xsize * ysize);
-	floatCompare(mask_g, r0_g, xsize * ysize);
-	floatCompare(mask_b, r0_b, xsize * ysize);
-	floatCompare(maskdc_r, r1_r, xsize * ysize);
-	floatCompare(maskdc_g, r1_g, xsize * ysize);
-	floatCompare(maskdc_b, r1_b, xsize * ysize);
+	FLOAT_COMPARE(mask_r, r0_r, xsize * ysize);
+	FLOAT_COMPARE(mask_g, r0_g, xsize * ysize);
+	FLOAT_COMPARE(mask_b, r0_b, xsize * ysize);
+	FLOAT_COMPARE(maskdc_r, r1_r, xsize * ysize);
+	FLOAT_COMPARE(maskdc_g, r1_g, xsize * ysize);
+	FLOAT_COMPARE(maskdc_b, r1_b, xsize * ysize);
+
+	clEnqueueUnmapMemObject(ocl.commandQueue, mask.r, r0_r, 0, NULL, NULL);  // 4th arg is the wait-list event count, not a byte size
+	clEnqueueUnmapMemObject(ocl.commandQueue, mask.g, r0_g, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, mask.b, r0_b, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.r, r1_r, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.g, r1_g, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.b, r1_b, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
 
 	ocl.releaseMemChannels(rgb);
 	ocl.releaseMemChannels(rgb2);
@@ -235,3 +267,45 @@ void clCalculateDiffmapEx(void)
 {
 
 }
+
+//
+void clBlur(void)
+{
+
+}
+
+//
+void clConvolution(void)
+{
+
+}
+
+//
+void clUpsample(void)
+{
+
+}
+
+//
+void clDiffPrecompute(void)
+{
+
+}
+
+//
+void clAverage5x5(void)
+{
+
+}
+
+//
+void clMinSquareVal(void)
+{
+
+}
+
+//
+void clScaleImage(void)
+{
+
+}

From 8d356925322225733afe7ae73b51e833acfb7400 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sun, 7 May 2017 16:56:09 +0800
Subject: [PATCH 039/189] =?UTF-8?q?=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?=
 =?UTF-8?q?=E5=88=86=E5=B7=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli_test.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 6718a17e..b14eb15d 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -268,43 +268,43 @@ void clCalculateDiffmapEx(void)
 
 }
 
-//
+// strong todo
 void clBlur(void)
 {
 
 }
 
-//
+// strong todo
 void clConvolution(void)
 {
 
 }
 
-//
+// strong todo
 void clUpsample(void)
 {
 
 }
 
-//
+// ian todo
 void clDiffPrecompute(void)
 {
 
 }
 
-//
+// ian todo
 void clAverage5x5(void)
 {
 
 }
 
-//
+// strong todo
 void clMinSquareVal(void)
 {
 
 }
 
-//
+// ian todo
 void clScaleImage(void)
 {
 

From 5864a11ba25c134967551bcea6d2c36050bcf107 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sun, 7 May 2017 21:06:44 +0800
Subject: [PATCH 040/189] =?UTF-8?q?MapBuffer=E4=B9=8B=E5=90=8E=E8=A6=81?=
 =?UTF-8?q?=E8=BF=9B=E8=A1=8CUnmap?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 9b1ae457..487d260d 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -233,6 +233,11 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float*
 	memcpy(g, result_g, channel_size);
 	memcpy(b, result_b, channel_size);
 
+	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, result_r, 0, NULL, NULL);  // 4th arg is the wait-list event count, not a byte size
+	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, result_g, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, result_b, 0, NULL, NULL);
+	clFinish(ocl.commandQueue);
+
     ocl.releaseMemChannels(rgb);
 	ocl.releaseMemChannels(rgb_blurred);
 }
@@ -854,6 +859,9 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	err = clFinish(ocl.commandQueue);
 	memcpy(result, result_r, channel_size);
 
+	clEnqueueUnmapMemObject(ocl.commandQueue, mem_result, result_r, 0, NULL, NULL);  // 4th arg is the wait-list event count, not a byte size
+	clFinish(ocl.commandQueue);
+
 	ocl.releaseMemChannels(xyb1);
 	ocl.releaseMemChannels(xyb0);
 

From 8474de05eab0784518f0897b20f7e19953995214 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sun, 7 May 2017 21:46:39 +0800
Subject: [PATCH 041/189] =?UTF-8?q?=E5=85=88=E6=8E=92=E6=9F=A5>100*100?=
 =?UTF-8?q?=E7=9A=84=E8=AE=A1=E7=AE=97=E7=B2=BE=E5=BA=A6=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli_test.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index b14eb15d..e7410a52 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -30,6 +30,8 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	const float* result_r, const float* result_g, const float* result_b,
 	const float* result_r2, const float* result_g2, const float* result_b2)
 {
+	if (xsize < 100 || ysize < 100) return;
+
 	size_t channel_size = xsize * ysize * sizeof(float);
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
@@ -79,6 +81,8 @@ void clEdgeDetectorMap(const float* r, const float* g, const float* b,
 	size_t xsize, size_t ysize, size_t step,
 	const float* result)
 {
+	if (xsize < 100 || ysize < 100) return;
+
 	size_t channel_size = xsize * ysize * sizeof(float);
 	const size_t res_xsize = (xsize + step - 1) / step;
 	const size_t res_ysize = (ysize + step - 1) / step;
@@ -119,6 +123,8 @@ void clBlockDiffMap(const float* r, const float* g, const float* b,
 	size_t xsize, size_t ysize, size_t step,
 	const float* result_diff_dc, const float* result_diff_ac)
 {
+	if (xsize < 100 || ysize < 100) return;
+
 	size_t channel_size = xsize * ysize * sizeof(float);
 	const size_t res_xsize = (xsize + step - 1) / step;
 	const size_t res_ysize = (ysize + step - 1) / step;
@@ -166,6 +172,8 @@ void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
 	size_t xsize, size_t ysize, size_t step,
 	const float* result_diff_dc)
 {
+	if (xsize < 100 || ysize < 100) return;
+
 	size_t channel_size = xsize * ysize * sizeof(float);
 	const size_t res_xsize = (xsize + step - 1) / step;
 	const size_t res_ysize = (ysize + step - 1) / step;
@@ -208,6 +216,8 @@ void clMask(const float* r, const float* g, const float* b,
 	const float* mask_r, const float* mask_g, const float* mask_b,
 	const float* maskdc_r, const float* maskdc_g, const float* maskdc_b)
 {
+	if (xsize < 100 || ysize < 100) return;
+
 	size_t channel_size = xsize * ysize * sizeof(float);
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();

From 1e8972fb31ccdd2b2b30abb6619fd0331e78803a Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Sun, 7 May 2017 23:55:38 +0800
Subject: [PATCH 042/189] Remove _constant for opencl 1.2

---
 clguetzli/clguetzli.cl | 158 +++++++++++++++++++++--------------------
 1 file changed, 80 insertions(+), 78 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 6351aa7a..59f80050 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -364,7 +364,7 @@ __kernel void CombineChannels(
 		DotProduct((float *)&edge_detector_map[3 * res_ix], mask));
 }
 
-inline double Interpolate(__constant double *array, int size, double sx) {
+inline double Interpolate(const double *array, int size, double sx) {
 	double ix = fabs(sx);
 
 	int baseix = (int)(ix);
@@ -381,37 +381,38 @@ inline double Interpolate(__constant double *array, int size, double sx) {
 	return res;
 }
 
-__constant double XybLowFreqToVals_inc = 5.2511644570349185;
-__constant double XybLowFreqToVals_lut[21] = {
-	0,
-	1 * XybLowFreqToVals_inc,
-	2 * XybLowFreqToVals_inc,
-	3 * XybLowFreqToVals_inc,
-	4 * XybLowFreqToVals_inc,
-	5 * XybLowFreqToVals_inc,
-	6 * XybLowFreqToVals_inc,
-	7 * XybLowFreqToVals_inc,
-	8 * XybLowFreqToVals_inc,
-	9 * XybLowFreqToVals_inc,
-	10 * XybLowFreqToVals_inc,
-	11 * XybLowFreqToVals_inc,
-	12 * XybLowFreqToVals_inc,
-	13 * XybLowFreqToVals_inc,
-	14 * XybLowFreqToVals_inc,
-	15 * XybLowFreqToVals_inc,
-	16 * XybLowFreqToVals_inc,
-	17 * XybLowFreqToVals_inc,
-	18 * XybLowFreqToVals_inc,
-	19 * XybLowFreqToVals_inc,
-	20 * XybLowFreqToVals_inc,
-};
-
 void XybLowFreqToVals(double x, double y, double z,
 	double *valx, double *valy, double *valz) {
 	const double xmul = 6.64482198135;
 	const double ymul = 0.837846224276;
 	const double zmul = 7.34905756986;
 	const double y_to_z_mul = 0.0812519812628;
+
+	const double XybLowFreqToVals_inc = 5.2511644570349185;
+	const double XybLowFreqToVals_lut[21] = {
+		0,
+		1 * XybLowFreqToVals_inc,
+		2 * XybLowFreqToVals_inc,
+		3 * XybLowFreqToVals_inc,
+		4 * XybLowFreqToVals_inc,
+		5 * XybLowFreqToVals_inc,
+		6 * XybLowFreqToVals_inc,
+		7 * XybLowFreqToVals_inc,
+		8 * XybLowFreqToVals_inc,
+		9 * XybLowFreqToVals_inc,
+		10 * XybLowFreqToVals_inc,
+		11 * XybLowFreqToVals_inc,
+		12 * XybLowFreqToVals_inc,
+		13 * XybLowFreqToVals_inc,
+		14 * XybLowFreqToVals_inc,
+		15 * XybLowFreqToVals_inc,
+		16 * XybLowFreqToVals_inc,
+		17 * XybLowFreqToVals_inc,
+		18 * XybLowFreqToVals_inc,
+		19 * XybLowFreqToVals_inc,
+		20 * XybLowFreqToVals_inc,
+	};
+
 	z += y_to_z_mul * y;
 	*valz = z * zmul;
 	*valx = x * xmul;
@@ -863,32 +864,6 @@ void ButteraugliFFTSquared(double block[kBlockSize]) {
 	}
 }
 
-__constant double MakeHighFreqColorDiffDy_off = 1.4103373714040413;
-__constant double MakeHighFreqColorDiffDy_inc = 0.7084088867024;
-__constant double MakeHighFreqColorDiffDy_lut[21] ={
-	0.0,
-	MakeHighFreqColorDiffDy_off,
-	MakeHighFreqColorDiffDy_off + 1 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 2 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 3 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 4 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 5 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 6 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 7 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 8 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 9 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 10 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 11 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 12 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 13 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 14 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 15 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 16 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 17 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 18 * MakeHighFreqColorDiffDy_inc,
-	MakeHighFreqColorDiffDy_off + 19 * MakeHighFreqColorDiffDy_inc,
-};
-
 double RemoveRangeAroundZero(double v, double range) {
 	if (v >= -range && v < range) {
 		return 0;
@@ -963,6 +938,33 @@ void ButteraugliBlockDiff(double xyb0[3 * kBlockSize],
 	const double ymul2 = 1.51983458269;
 	const double zmul = 2.4;
 
+	const double MakeHighFreqColorDiffDy_off = 1.4103373714040413;
+	const double MakeHighFreqColorDiffDy_inc = 0.7084088867024;
+	const double MakeHighFreqColorDiffDy_lut[21] = {
+		0.0,
+		MakeHighFreqColorDiffDy_off,
+		MakeHighFreqColorDiffDy_off + 1 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 2 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 3 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 4 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 5 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 6 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 7 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 8 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 9 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 10 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 11 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 12 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 13 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 14 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 15 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 16 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 17 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 18 * MakeHighFreqColorDiffDy_inc,
+		MakeHighFreqColorDiffDy_off + 19 * MakeHighFreqColorDiffDy_inc,
+	};
+
+
 	for (size_t i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) {
 		double d = csf8x8[i];
 		diff_xyb_ac[0] += d * xmul * x_halfdiff_squared[i];
@@ -1102,32 +1104,6 @@ __kernel void MaskHighIntensityChange(
 }
 
 
-__constant double XybToVals_off = 11.38708334481672;
-__constant double XybToVals_inc = 14.550189611520716;
-__constant double XybToVals_lut[21] = {
-	0,
-	XybToVals_off,
-	XybToVals_off + 1 * XybToVals_inc,
-	XybToVals_off + 2 * XybToVals_inc,
-	XybToVals_off + 3 * XybToVals_inc,
-	XybToVals_off + 4 * XybToVals_inc,
-	XybToVals_off + 5 * XybToVals_inc,
-	XybToVals_off + 6 * XybToVals_inc,
-	XybToVals_off + 7 * XybToVals_inc,
-	XybToVals_off + 8 * XybToVals_inc,
-	XybToVals_off + 9 * XybToVals_inc,
-	XybToVals_off + 10 * XybToVals_inc,
-	XybToVals_off + 11 * XybToVals_inc,
-	XybToVals_off + 12 * XybToVals_inc,
-	XybToVals_off + 13 * XybToVals_inc,
-	XybToVals_off + 14 * XybToVals_inc,
-	XybToVals_off + 15 * XybToVals_inc,
-	XybToVals_off + 16 * XybToVals_inc,
-	XybToVals_off + 17 * XybToVals_inc,
-	XybToVals_off + 18 * XybToVals_inc,
-	XybToVals_off + 19 * XybToVals_inc,
-};
-
 void XybToVals(
 	double x, double y, double z,
 	double *valx, double *valy, double *valz)
@@ -1136,6 +1112,32 @@ void XybToVals(
     const double ymul = 2.28148649801;
 	const double zmul = 1.87816926918;
 
+	const double XybToVals_off = 11.38708334481672;
+	const double XybToVals_inc = 14.550189611520716;
+	const double XybToVals_lut[21] = {
+		0,
+		XybToVals_off,
+		XybToVals_off + 1 * XybToVals_inc,
+		XybToVals_off + 2 * XybToVals_inc,
+		XybToVals_off + 3 * XybToVals_inc,
+		XybToVals_off + 4 * XybToVals_inc,
+		XybToVals_off + 5 * XybToVals_inc,
+		XybToVals_off + 6 * XybToVals_inc,
+		XybToVals_off + 7 * XybToVals_inc,
+		XybToVals_off + 8 * XybToVals_inc,
+		XybToVals_off + 9 * XybToVals_inc,
+		XybToVals_off + 10 * XybToVals_inc,
+		XybToVals_off + 11 * XybToVals_inc,
+		XybToVals_off + 12 * XybToVals_inc,
+		XybToVals_off + 13 * XybToVals_inc,
+		XybToVals_off + 14 * XybToVals_inc,
+		XybToVals_off + 15 * XybToVals_inc,
+		XybToVals_off + 16 * XybToVals_inc,
+		XybToVals_off + 17 * XybToVals_inc,
+		XybToVals_off + 18 * XybToVals_inc,
+		XybToVals_off + 19 * XybToVals_inc,
+	};
+
 	*valx = Interpolate(&XybToVals_lut[0], 21, x * xmul);
 	*valy = Interpolate(&XybToVals_lut[0], 21, y * ymul);
 	*valz = zmul * z;

From 9400c2130301ca241f50d935867da37ab75dfdea Mon Sep 17 00:00:00 2001
From: ianuming <uming.zelda@gmail.com>
Date: Mon, 8 May 2017 10:22:11 +0800
Subject: [PATCH 043/189] Remove _constant for opencl 2.0

---
 clguetzli/clguetzli.cl | 84 +++++++++++++++++++++---------------------
 1 file changed, 43 insertions(+), 41 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 59f80050..a8af0c0e 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -580,55 +580,15 @@ __kernel void edgeDetectorLowFreq(__global float *result,
 #define kBlockEdgeHalf  (kBlockEdge / 2)
 #define kBlockHalf (kBlockEdge * kBlockEdgeHalf)
 
-__constant double csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = {
-	5.28270670524,
-	0.0,
-	0.0,
-	0.0,
-	0.3831134973,
-	0.676303603859,
-	3.58927792424,
-	18.6104367002,
-	18.6104367002,
-	3.09093131948,
-	1.0,
-	0.498250875965,
-	0.36198671102,
-	0.308982169883,
-	0.1312701920435,
-	2.37370549629,
-	3.58927792424,
-	1.0,
-	2.37370549629,
-	0.991205724152,
-	1.05178802919,
-	0.627264168628,
-	0.4,
-	0.1312701920435,
-	0.676303603859,
-	0.498250875965,
-	0.991205724152,
-	0.5,
-	0.3831134973,
-	0.349686450518,
-	0.627264168628,
-	0.308982169883,
-	0.3831134973,
-	0.36198671102,
-	1.05178802919,
-	0.3831134973,
-	0.12,
-};
-
 typedef struct __Complex
 {
 	double real;
 	double imag;
 }Complex;
 
-constant double kSqrtHalf = 0.70710678118654752440084436210484903;
 
 void RealFFT8(const double* in, Complex* out) {
+	const double kSqrtHalf = 0.70710678118654752440084436210484903;
 	double t1, t2, t3, t5, t6, t7, t8;
 	t8 = in[6];
 	t5 = in[2] - t8;
@@ -743,6 +703,7 @@ inline void FFT4(Complex* a) {
 
 //  D. J. Bernstein's Fast Fourier Transform algorithm on 8 elements.
 void FFT8(Complex* a) {
+	const double kSqrtHalf = 0.70710678118654752440084436210484903;
 	double t1, t2, t3, t4, t5, t6, t7, t8;
 
 	t7 = a[4].imag;
@@ -887,6 +848,47 @@ void ButteraugliBlockDiff(double xyb0[3 * kBlockSize],
 
 	double avgdiff_xyb[3] = { 0.0 };
 	double avgdiff_edge[3][4] = { { 0.0 } };
+
+	const double csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = {
+		5.28270670524,
+		0.0,
+		0.0,
+		0.0,
+		0.3831134973,
+		0.676303603859,
+		3.58927792424,
+		18.6104367002,
+		18.6104367002,
+		3.09093131948,
+		1.0,
+		0.498250875965,
+		0.36198671102,
+		0.308982169883,
+		0.1312701920435,
+		2.37370549629,
+		3.58927792424,
+		1.0,
+		2.37370549629,
+		0.991205724152,
+		1.05178802919,
+		0.627264168628,
+		0.4,
+		0.1312701920435,
+		0.676303603859,
+		0.498250875965,
+		0.991205724152,
+		0.5,
+		0.3831134973,
+		0.349686450518,
+		0.627264168628,
+		0.308982169883,
+		0.3831134973,
+		0.36198671102,
+		1.05178802919,
+		0.3831134973,
+		0.12,
+	};
+
 	for (int i = 0; i < 3 * kBlockSize; ++i) {
 		const double diff_xyb = xyb0[i] - xyb1[i];
 		const int c = i / kBlockSize;

From 6962f20172ae682afb4c641b92009cccaf642162 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 8 May 2017 11:04:57 +0800
Subject: [PATCH 044/189] =?UTF-8?q?=E4=BF=AE=E5=A4=8DnVidia=E6=98=BE?=
 =?UTF-8?q?=E5=8D=A1=E7=9A=84=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl       | 14 +++++++-------
 clguetzli/clguetzli_test.cpp |  5 +++++
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index a8af0c0e..bede6431 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -323,7 +323,7 @@ __kernel void ScaleImage(float scale, __global float *result)
 	result[i] *= scale;
 }
 
-double DotProduct(float u[3], double v[3]) {
+double DotProduct(__global float u[3], double v[3]) {
   return u[0] * v[0] + u[1] * v[1] + u[2] * v[2];
 }
 
@@ -359,9 +359,9 @@ __kernel void CombineChannels(
 
 	size_t res_ix = (res_y * res_xsize + res_x) / step;
 	result[res_ix] = (float)(
-		DotProduct((float *)&block_diff_dc[3 * res_ix], dc_mask) +
-		DotProduct((float *)&block_diff_ac[3 * res_ix], mask) +
-		DotProduct((float *)&edge_detector_map[3 * res_ix], mask));
+		DotProduct(&block_diff_dc[3 * res_ix], dc_mask) +
+		DotProduct(&block_diff_ac[3 * res_ix], mask) +
+		DotProduct(&edge_detector_map[3 * res_ix], mask));
 }
 
 inline double Interpolate(const double *array, int size, double sx) {
@@ -800,7 +800,7 @@ double abssq(const Complex c) {
 	return c.real * c.real + c.imag * c.imag;
 }
 
-void ButteraugliFFTSquared(double block[kBlockSize]) {
+void ButteraugliFFTSquared(__private double block[kBlockSize]) {
 	double global_mul = 0.000064;
 	Complex block_c[kBlockSize];
 
@@ -840,8 +840,8 @@ double RemoveRangeAroundZero(double v, double range) {
 // Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared
 // 3-dimensional xybdiff of the two blocks to diff_xyb_{dc,ac} and the average
 // diff on the edges to diff_xyb_edge_dc.
-void ButteraugliBlockDiff(double xyb0[3 * kBlockSize],
-	double xyb1[3 * kBlockSize],
+void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize],
+	__private double xyb1[3 * kBlockSize],
 	double diff_xyb_dc[3],
 	double diff_xyb_ac[3],
 	double diff_xyb_edge_dc[3]) {
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index e7410a52..971fa085 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -30,6 +30,7 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	const float* result_r, const float* result_g, const float* result_b,
 	const float* result_r2, const float* result_g2, const float* result_b2)
 {
+	return;
 	if (xsize < 100 || ysize < 100) return;
 
 	size_t channel_size = xsize * ysize * sizeof(float);
@@ -81,6 +82,7 @@ void clEdgeDetectorMap(const float* r, const float* g, const float* b,
 	size_t xsize, size_t ysize, size_t step,
 	const float* result)
 {
+	return;
 	if (xsize < 100 || ysize < 100) return;
 
 	size_t channel_size = xsize * ysize * sizeof(float);
@@ -123,6 +125,7 @@ void clBlockDiffMap(const float* r, const float* g, const float* b,
 	size_t xsize, size_t ysize, size_t step,
 	const float* result_diff_dc, const float* result_diff_ac)
 {
+	return;
 	if (xsize < 100 || ysize < 100) return;
 
 	size_t channel_size = xsize * ysize * sizeof(float);
@@ -172,6 +175,7 @@ void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
 	size_t xsize, size_t ysize, size_t step,
 	const float* result_diff_dc)
 {
+	return;
 	if (xsize < 100 || ysize < 100) return;
 
 	size_t channel_size = xsize * ysize * sizeof(float);
@@ -216,6 +220,7 @@ void clMask(const float* r, const float* g, const float* b,
 	const float* mask_r, const float* mask_g, const float* mask_b,
 	const float* maskdc_r, const float* maskdc_g, const float* maskdc_b)
 {
+	return;
 	if (xsize < 100 || ysize < 100) return;
 
 	size_t channel_size = xsize * ysize * sizeof(float);

From c1f83bbb1f7b61786f8f9edfb9e3882153f72a4b Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 8 May 2017 14:56:05 +0800
Subject: [PATCH 045/189] =?UTF-8?q?fixed=20n=E5=8D=A1=20=5F=5Fconstant?=
 =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl       | 170 ++++++++++++++++++-----------------
 clguetzli/clguetzli_test.cpp |   8 +-
 2 files changed, 90 insertions(+), 88 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index bede6431..c742dfd3 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -87,9 +87,7 @@ __kernel void DownSample(__global float* pA, __global float* pC, int xstep, int
 	pC[y * xsize + x] = pA[sample_y * oxsize + sample_x];
 }
 
-void OpsinAbsorbance(const double in[3], double out[3])
-{
-	const float mix[12] = {
+__constant 	float g_mix[12] = {
 	0.348036746003,
 	0.577814843137,
 	0.0544556093735,
@@ -102,11 +100,13 @@ void OpsinAbsorbance(const double in[3], double out[3])
 	0.158581714673,
 	0.712857943858,
 	10.6524069248,
-	};
+};
 
-	out[0] = mix[0] * in[0] + mix[1] * in[1] + mix[2] * in[2] + mix[3];
-	out[1] = mix[4] * in[0] + mix[5] * in[1] + mix[6] * in[2] + mix[7];
-	out[2] = mix[8] * in[0] + mix[9] * in[1] + mix[10] * in[2] + mix[11];
+void OpsinAbsorbance(const double in[3], double out[3])
+{
+	out[0] = g_mix[0] * in[0] + g_mix[1] * in[1] + g_mix[2] * in[2] + g_mix[3];
+	out[1] = g_mix[4] * in[0] + g_mix[5] * in[1] + g_mix[6] * in[2] + g_mix[7];
+	out[2] = g_mix[8] * in[0] + g_mix[9] * in[1] + g_mix[10] * in[2] + g_mix[11];
 }
 
 double EvaluatePolynomial(const double x, const double *coefficients, int n)
@@ -364,7 +364,7 @@ __kernel void CombineChannels(
 		DotProduct(&edge_detector_map[3 * res_ix], mask));
 }
 
-inline double Interpolate(const double *array, int size, double sx) {
+inline double Interpolate(__constant double *array, int size, double sx) {
 	double ix = fabs(sx);
 
 	int baseix = (int)(ix);
@@ -381,6 +381,31 @@ inline double Interpolate(const double *array, int size, double sx) {
 	return res;
 }
 
+#define XybLowFreqToVals_inc 5.2511644570349185
+__constant double XybLowFreqToVals_lut[21] = {
+	0,
+	1 * XybLowFreqToVals_inc,
+	2 * XybLowFreqToVals_inc,
+	3 * XybLowFreqToVals_inc,
+	4 * XybLowFreqToVals_inc,
+	5 * XybLowFreqToVals_inc,
+	6 * XybLowFreqToVals_inc,
+	7 * XybLowFreqToVals_inc,
+	8 * XybLowFreqToVals_inc,
+	9 * XybLowFreqToVals_inc,
+	10 * XybLowFreqToVals_inc,
+	11 * XybLowFreqToVals_inc,
+	12 * XybLowFreqToVals_inc,
+	13 * XybLowFreqToVals_inc,
+	14 * XybLowFreqToVals_inc,
+	15 * XybLowFreqToVals_inc,
+	16 * XybLowFreqToVals_inc,
+	17 * XybLowFreqToVals_inc,
+	18 * XybLowFreqToVals_inc,
+	19 * XybLowFreqToVals_inc,
+	20 * XybLowFreqToVals_inc,
+};
+
 void XybLowFreqToVals(double x, double y, double z,
 	double *valx, double *valy, double *valz) {
 	const double xmul = 6.64482198135;
@@ -388,31 +413,6 @@ void XybLowFreqToVals(double x, double y, double z,
 	const double zmul = 7.34905756986;
 	const double y_to_z_mul = 0.0812519812628;
 
-	const double XybLowFreqToVals_inc = 5.2511644570349185;
-	const double XybLowFreqToVals_lut[21] = {
-		0,
-		1 * XybLowFreqToVals_inc,
-		2 * XybLowFreqToVals_inc,
-		3 * XybLowFreqToVals_inc,
-		4 * XybLowFreqToVals_inc,
-		5 * XybLowFreqToVals_inc,
-		6 * XybLowFreqToVals_inc,
-		7 * XybLowFreqToVals_inc,
-		8 * XybLowFreqToVals_inc,
-		9 * XybLowFreqToVals_inc,
-		10 * XybLowFreqToVals_inc,
-		11 * XybLowFreqToVals_inc,
-		12 * XybLowFreqToVals_inc,
-		13 * XybLowFreqToVals_inc,
-		14 * XybLowFreqToVals_inc,
-		15 * XybLowFreqToVals_inc,
-		16 * XybLowFreqToVals_inc,
-		17 * XybLowFreqToVals_inc,
-		18 * XybLowFreqToVals_inc,
-		19 * XybLowFreqToVals_inc,
-		20 * XybLowFreqToVals_inc,
-	};
-
 	z += y_to_z_mul * y;
 	*valz = z * zmul;
 	*valx = x * xmul;
@@ -837,6 +837,33 @@ double RemoveRangeAroundZero(double v, double range) {
 	}
 }
 
+#define MakeHighFreqColorDiffDy_off  1.4103373714040413
+#define MakeHighFreqColorDiffDy_inc  0.7084088867024
+__constant double MakeHighFreqColorDiffDy_lut[21] = {
+	0.0,
+	MakeHighFreqColorDiffDy_off,
+	MakeHighFreqColorDiffDy_off + 1 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 2 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 3 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 4 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 5 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 6 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 7 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 8 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 9 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 10 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 11 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 12 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 13 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 14 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 15 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 16 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 17 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 18 * MakeHighFreqColorDiffDy_inc,
+	MakeHighFreqColorDiffDy_off + 19 * MakeHighFreqColorDiffDy_inc,
+};
+
+
 // Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared
 // 3-dimensional xybdiff of the two blocks to diff_xyb_{dc,ac} and the average
 // diff on the edges to diff_xyb_edge_dc.
@@ -940,31 +967,6 @@ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize],
 	const double ymul2 = 1.51983458269;
 	const double zmul = 2.4;
 
-	const double MakeHighFreqColorDiffDy_off = 1.4103373714040413;
-	const double MakeHighFreqColorDiffDy_inc = 0.7084088867024;
-	const double MakeHighFreqColorDiffDy_lut[21] = {
-		0.0,
-		MakeHighFreqColorDiffDy_off,
-		MakeHighFreqColorDiffDy_off + 1 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 2 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 3 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 4 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 5 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 6 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 7 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 8 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 9 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 10 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 11 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 12 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 13 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 14 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 15 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 16 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 17 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 18 * MakeHighFreqColorDiffDy_inc,
-		MakeHighFreqColorDiffDy_off + 19 * MakeHighFreqColorDiffDy_inc,
-	};
 
 
 	for (size_t i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) {
@@ -1106,6 +1108,32 @@ __kernel void MaskHighIntensityChange(
 }
 
 
+#define XybToVals_off 11.38708334481672
+#define XybToVals_inc 14.550189611520716
+__constant double XybToVals_lut[21] = {
+	0,
+	XybToVals_off,
+	XybToVals_off + 1 * XybToVals_inc,
+	XybToVals_off + 2 * XybToVals_inc,
+	XybToVals_off + 3 * XybToVals_inc,
+	XybToVals_off + 4 * XybToVals_inc,
+	XybToVals_off + 5 * XybToVals_inc,
+	XybToVals_off + 6 * XybToVals_inc,
+	XybToVals_off + 7 * XybToVals_inc,
+	XybToVals_off + 8 * XybToVals_inc,
+	XybToVals_off + 9 * XybToVals_inc,
+	XybToVals_off + 10 * XybToVals_inc,
+	XybToVals_off + 11 * XybToVals_inc,
+	XybToVals_off + 12 * XybToVals_inc,
+	XybToVals_off + 13 * XybToVals_inc,
+	XybToVals_off + 14 * XybToVals_inc,
+	XybToVals_off + 15 * XybToVals_inc,
+	XybToVals_off + 16 * XybToVals_inc,
+	XybToVals_off + 17 * XybToVals_inc,
+	XybToVals_off + 18 * XybToVals_inc,
+	XybToVals_off + 19 * XybToVals_inc,
+};
+
 void XybToVals(
 	double x, double y, double z,
 	double *valx, double *valy, double *valz)
@@ -1114,32 +1142,6 @@ void XybToVals(
     const double ymul = 2.28148649801;
 	const double zmul = 1.87816926918;
 
-	const double XybToVals_off = 11.38708334481672;
-	const double XybToVals_inc = 14.550189611520716;
-	const double XybToVals_lut[21] = {
-		0,
-		XybToVals_off,
-		XybToVals_off + 1 * XybToVals_inc,
-		XybToVals_off + 2 * XybToVals_inc,
-		XybToVals_off + 3 * XybToVals_inc,
-		XybToVals_off + 4 * XybToVals_inc,
-		XybToVals_off + 5 * XybToVals_inc,
-		XybToVals_off + 6 * XybToVals_inc,
-		XybToVals_off + 7 * XybToVals_inc,
-		XybToVals_off + 8 * XybToVals_inc,
-		XybToVals_off + 9 * XybToVals_inc,
-		XybToVals_off + 10 * XybToVals_inc,
-		XybToVals_off + 11 * XybToVals_inc,
-		XybToVals_off + 12 * XybToVals_inc,
-		XybToVals_off + 13 * XybToVals_inc,
-		XybToVals_off + 14 * XybToVals_inc,
-		XybToVals_off + 15 * XybToVals_inc,
-		XybToVals_off + 16 * XybToVals_inc,
-		XybToVals_off + 17 * XybToVals_inc,
-		XybToVals_off + 18 * XybToVals_inc,
-		XybToVals_off + 19 * XybToVals_inc,
-	};
-
 	*valx = Interpolate(&XybToVals_lut[0], 21, x * xmul);
 	*valy = Interpolate(&XybToVals_lut[0], 21, y * ymul);
 	*valz = zmul * z;
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 971fa085..1bb20681 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -283,19 +283,19 @@ void clCalculateDiffmapEx(void)
 
 }
 
-// strong todo
+// chrisk todo
 void clBlur(void)
 {
 
 }
 
-// strong todo
+// chrisk todo
 void clConvolution(void)
 {
 
 }
 
-// strong todo
+// chirsk todo
 void clUpsample(void)
 {
 
@@ -313,7 +313,7 @@ void clAverage5x5(void)
 
 }
 
-// strong todo
+// chrisk todo
 void clMinSquareVal(void)
 {
 

From a8aba9b6b256b57383c0a680edbc8f9bef1ccac9 Mon Sep 17 00:00:00 2001
From: ianuming <uming.zelda@gmail.com>
Date: Mon, 8 May 2017 15:23:44 +0800
Subject: [PATCH 046/189] Fix __constant error for nvidia device

---
 clguetzli/clguetzli.cl | 41 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index c742dfd3..c5284a52 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -587,8 +587,8 @@ typedef struct __Complex
 }Complex;
 
 
+__constant double kSqrtHalf = 0.70710678118654752440084436210484903;
 void RealFFT8(const double* in, Complex* out) {
-	const double kSqrtHalf = 0.70710678118654752440084436210484903;
 	double t1, t2, t3, t5, t6, t7, t8;
 	t8 = in[6];
 	t5 = in[2] - t8;
@@ -863,6 +863,45 @@ __constant double MakeHighFreqColorDiffDy_lut[21] = {
 	MakeHighFreqColorDiffDy_off + 19 * MakeHighFreqColorDiffDy_inc,
 };
 
+__constant double csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = {
+	5.28270670524,
+	0.0,
+	0.0,
+	0.0,
+	0.3831134973,
+	0.676303603859,
+	3.58927792424,
+	18.6104367002,
+	18.6104367002,
+	3.09093131948,
+	1.0,
+	0.498250875965,
+	0.36198671102,
+	0.308982169883,
+	0.1312701920435,
+	2.37370549629,
+	3.58927792424,
+	1.0,
+	2.37370549629,
+	0.991205724152,
+	1.05178802919,
+	0.627264168628,
+	0.4,
+	0.1312701920435,
+	0.676303603859,
+	0.498250875965,
+	0.991205724152,
+	0.5,
+	0.3831134973,
+	0.349686450518,
+	0.627264168628,
+	0.308982169883,
+	0.3831134973,
+	0.36198671102,
+	1.05178802919,
+	0.3831134973,
+	0.12,
+};
 
 // Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared
 // 3-dimensional xybdiff of the two blocks to diff_xyb_{dc,ac} and the average

From d9e3808417e33d22bcf54aaa6e60418b0bc82ee3 Mon Sep 17 00:00:00 2001
From: ianuming <uming.zelda@gmail.com>
Date: Mon, 8 May 2017 16:36:46 +0800
Subject: [PATCH 047/189] Optimize clDoMask

---
 clguetzli/clguetzli.cl  | 99 ++++++-----------------------------------
 clguetzli/clguetzli.cpp | 90 ++++++++++++++++++++++++++++++++++---
 clguetzli/ocl.h         |  7 +++
 3 files changed, 104 insertions(+), 92 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index c5284a52..1a79c62e 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -192,7 +192,7 @@ __kernel void OpsinDynamicsImage(
 }
 
 
-double InterpolateClampNegative(const double *array,
+double InterpolateClampNegative(__global const double *array,
 	int size, double sx) {
 	if (sx < 0) {
 		sx = 0;
@@ -211,87 +211,11 @@ double InterpolateClampNegative(const double *array,
 	return res;
 }
 
-void MakeMask(double extmul, double extoff,
-	double mul, double offset,
-	double scaler, double *result)
-{
-	for (size_t i = 0; i < 512; ++i) {
-		const double c = mul / ((0.01 * scaler * i) + offset);
-		result[i] = 1.0 + extmul * (c + extoff);
-		result[i] *= result[i];
-	}
-}
-
-double MaskX(double delta) {
-	const double extmul = 0.975741017749;
-	const double extoff = -4.25328244168;
-	const double offset = 0.454909521427;
-	const double scaler = 0.0738288224836;
-	const double mul = 20.8029176447;
-	double lut[512];
-	MakeMask(extmul, extoff, mul, offset, scaler, lut);
-	return InterpolateClampNegative(lut, 512, delta);
-}
-
-double MaskY(double delta) {
-	const double extmul = 0.373995618954;
-	const double extoff = 1.5307267433;
-	const double offset = 0.911952641929;
-	const double scaler = 1.1731667845;
-	const double mul = 16.2447033988;
-	double lut[512];
-	MakeMask(extmul, extoff, mul, offset, scaler, lut);
-	return InterpolateClampNegative(lut, 512, delta);
-}
-
-double MaskB(double delta) {
-	const double extmul = 0.61582234137;
-	const double extoff = -4.25376118646;
-	const double offset = 1.05105070921;
-	const double scaler = 0.47434643535;
-	const double mul = 31.1444967089;
-	double lut[512];
-	MakeMask(extmul, extoff, mul, offset, scaler, lut);
-	return InterpolateClampNegative(lut, 512, delta);
-}
-
-double MaskDcX(double delta) {
-	const double extmul = 1.79116943438;
-	const double extoff = -3.86797479189;
-	const double offset = 0.670960225853;
-	const double scaler = 0.486575865525;
-	const double mul = 20.4563479139;
-	double lut[512];
-	MakeMask(extmul, extoff, mul, offset, scaler, lut);
-	return InterpolateClampNegative(lut, 512, delta);
-}
-
-double MaskDcY(double delta) {
-	const double extmul = 0.212223514236;
-	const double extoff = -3.65647120524;
-	const double offset = 1.73396799447;
-	const double scaler = 0.170392660501;
-	const double mul = 21.6566724788;
-	double lut[512];
-	MakeMask(extmul, extoff, mul, offset, scaler, lut);
-	return InterpolateClampNegative(lut, 512, delta);
-}
-
-double MaskDcB(double delta) {
-	const double extmul = 0.349376011816;
-	const double extoff = -0.894711072781;
-	const double offset = 0.901647926679;
-	const double scaler = 0.380086095024;
-	const double mul = 18.0373825149;
-	double lut[512];
-	MakeMask(extmul, extoff, mul, offset, scaler, lut);
-	return InterpolateClampNegative(lut, 512, delta);
-}
-
 __kernel void DoMask(
 	__global float *mask_x, __global float *mask_y, __global float *mask_b,
 	__global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b,
-	int xsize, int ysize)
+	__global double *lut_x, __global double *lut_y, __global double *lut_b,
+	__global double *lut_dc_x, __global double *lut_dc_y, __global double *lut_dc_b)
 {
 	const double w00 = 232.206464018;
 	const double w11 = 22.9455222245;
@@ -300,6 +224,9 @@ __kernel void DoMask(
 	const int x = get_global_id(0);
 	const int y = get_global_id(1);
 
+	const int xsize = get_global_size(0);
+	const int ysize = get_global_size(1);
+
 	const size_t idx = y * xsize + x;
 	const double s0 = mask_x[idx];
 	const double s1 = mask_y[idx];
@@ -308,16 +235,16 @@ __kernel void DoMask(
 	const double p1 = w11 * s1;
 	const double p2 = w22 * s2;
 
-	mask_x[idx] = (float)(MaskX(p0));
-	mask_y[idx] = (float)(MaskY(p1));
-	mask_b[idx] = (float)(MaskB(p2));
-	mask_dc_x[idx] = (float)(MaskDcX(p0));
-	mask_dc_y[idx] = (float)(MaskDcY(p1));
-	mask_dc_b[idx] = (float)(MaskDcB(p2));
+	mask_x[idx] = (float)(InterpolateClampNegative(lut_x, 512, p0));
+	mask_y[idx] = (float)(InterpolateClampNegative(lut_y, 512, p1));
+	mask_b[idx] = (float)(InterpolateClampNegative(lut_b, 512, p2));
+	mask_dc_x[idx] = (float)(InterpolateClampNegative(lut_dc_x, 512, p0));
+	mask_dc_y[idx] = (float)(InterpolateClampNegative(lut_dc_y, 512, p1));
+	mask_dc_b[idx] = (float)(InterpolateClampNegative(lut_dc_b, 512, p2));
 
 }
 
-__kernel void ScaleImage(float scale, __global float *result)
+__kernel void ScaleImage(double scale, __global float *result)
 {
 	const int i = get_global_id(0);
 	result[i] *= scale;
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 487d260d..bc6e5bb1 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -188,7 +188,7 @@ void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred
 	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&rgb_blurred.b);
 	clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&clSize);
 
-	size_t globalWorkSize[1] = { clSize };
+	size_t globalWorkSize[1] = { size };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
@@ -468,15 +468,15 @@ void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size
 	}
 }
 
-void clScaleImageEx(cl_mem img/*in, out*/, size_t size, float w)
+void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
-	cl_float clscale = w;
+	cl_double clscale = w;
 
 	cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE];
-	clSetKernelArg(kernel, 0, sizeof(cl_float), (void*)&clscale);
+	clSetKernelArg(kernel, 0, sizeof(cl_double), (void*)&clscale);
 	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img);
 
 	size_t globalWorkSize[1] = { size };
@@ -587,6 +587,18 @@ void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t s
 	}
 }
 
+
+static void MakeMask(double extmul, double extoff,
+	double mul, double offset,
+	double scaler, double *result)
+{
+	for (size_t i = 0; i < 512; ++i) {
+		const double c = mul / ((0.01 * scaler * i) + offset);
+		result[i] = 1.0 + extmul * (c + extoff);
+		result[i] *= result[i];
+	}
+}
+
 static const double kInternalGoodQualityThreshold = 14.921561160295326;
 static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold;
 
@@ -598,6 +610,64 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	cl_int clxsize = xsize;
 	cl_int clysize = ysize;
 
+	double extmul = 0.975741017749;
+	double extoff = -4.25328244168;
+	double offset = 0.454909521427;
+	double scaler = 0.0738288224836;
+	double mul = 20.8029176447;
+	double lut_x[512];
+	MakeMask(extmul, extoff, mul, offset, scaler, lut_x);
+
+	extmul = 0.373995618954;
+	extoff = 1.5307267433;
+	offset = 0.911952641929;
+	scaler = 1.1731667845;
+	mul = 16.2447033988;
+	double lut_y[512];
+	MakeMask(extmul, extoff, mul, offset, scaler, lut_y);
+
+	extmul = 0.61582234137;
+	extoff = -4.25376118646;
+	offset = 1.05105070921;
+	scaler = 0.47434643535;
+	mul = 31.1444967089;
+	double lut_b[512];
+	MakeMask(extmul, extoff, mul, offset, scaler, lut_b);
+
+	extmul = 1.79116943438;
+	extoff = -3.86797479189;
+	offset = 0.670960225853;
+	scaler = 0.486575865525;
+	mul = 20.4563479139;
+	double lut_dcx[512];
+	MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx);
+
+	extmul = 0.212223514236;
+	extoff = -3.65647120524;
+	offset = 1.73396799447;
+	scaler = 0.170392660501;
+	mul = 21.6566724788;
+	double lut_dcy[512];
+	MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy);
+
+	extmul = 0.349376011816;
+	extoff = -0.894711072781;
+	offset = 0.901647926679;
+	scaler = 0.380086095024;
+	mul = 18.0373825149;
+	double lut_dcb[512];
+	MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb);
+
+	size_t channel_size = 512 * 3 * sizeof(double);
+	ocl_channels xyb = ocl.allocMemChannels(channel_size);
+	ocl_channels xyb_dc = ocl.allocMemChannels(channel_size);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb.x, CL_FALSE, 0, channel_size, lut_x, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb.y, CL_FALSE, 0, channel_size, lut_y, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb.b, CL_FALSE, 0, channel_size, lut_b, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb_dc.x, CL_FALSE, 0, channel_size, lut_dcx, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb_dc.y, CL_FALSE, 0, channel_size, lut_dcy, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, xyb_dc.b, CL_FALSE, 0, channel_size, lut_dcb, 0, NULL, NULL);
+
 	cl_kernel kernel = ocl.kernel[KERNEL_DOMASK];
 	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.r);
 	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.g);
@@ -605,8 +675,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask_dc.r);
 	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mask_dc.g);
 	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask_dc.b);
-	clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&clxsize);
-	clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&clysize);
+	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&xyb.x);
+	clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&xyb.y);
+	clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&xyb.b);
+	clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&xyb_dc.x);
+	clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&xyb_dc.y);
+	clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&xyb_dc.b);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -619,8 +693,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	{
 		LogError("Error: clDoMask() clFinish returned %s.\n", TranslateOpenCLError(err));
 	}
+
+	ocl.releaseMemChannels(xyb);
+	ocl.releaseMemChannels(xyb_dc);
 }
 
+
 void clMaskEx(ocl_channels rgb, ocl_channels rgb2,
 	size_t xsize, size_t ysize,
 	ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/)
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 7babc74e..a210d1c1 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -72,6 +72,13 @@ typedef union ocl_channels_t
         cl_mem b;
     };
 
+	struct
+	{
+		cl_mem x;
+		cl_mem y;
+		cl_mem b;
+	};
+
     cl_mem ch[3];
 }ocl_channels;
 

From 77314278f9e966127aba0984bf19b9da128dbfcd Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 8 May 2017 16:54:31 +0800
Subject: [PATCH 048/189] =?UTF-8?q?32=E4=BD=8D=E5=B9=B3=E5=8F=B0=E7=BC=96?=
 =?UTF-8?q?=E8=AF=91=E9=85=8D=E7=BD=AE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 guetzli.vcxproj | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index e48d1682..d2cf62c7 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -136,7 +136,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>Disabled</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>false</IntrinsicFunctions>
@@ -151,9 +151,10 @@
       <AdditionalDependencies>shlwapi.lib;OpenCL.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
+      <AdditionalLibraryDirectories>$(INTELOCLSDKROOT)lib\x86</AdditionalLibraryDirectories>
     </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
@@ -177,7 +178,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
@@ -188,6 +189,7 @@
       <AdditionalDependencies>shlwapi.lib;OpenCL.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
+      <AdditionalLibraryDirectories>$(INTELOCLSDKROOT)lib\x86</AdditionalLibraryDirectories>
     </Link>
   </ItemDefinitionGroup>
   <ItemGroup>

From 3116d6a216f2a20cf6c90ae40ff71e5ba9a11d76 Mon Sep 17 00:00:00 2001
From: ianuming <uming.zelda@gmail.com>
Date: Mon, 8 May 2017 17:27:20 +0800
Subject: [PATCH 049/189] Move some local constant array to __constant

---
 clguetzli/clguetzli.cl  | 80 ++++++++++-------------------------------
 clguetzli/clguetzli.cpp | 12 +++----
 guetzli.vcxproj         |  2 +-
 3 files changed, 25 insertions(+), 69 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 1a79c62e..564fff1e 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -109,7 +109,7 @@ void OpsinAbsorbance(const double in[3], double out[3])
 	out[2] = g_mix[8] * in[0] + g_mix[9] * in[1] + g_mix[10] * in[2] + g_mix[11];
 }
 
-double EvaluatePolynomial(const double x, const double *coefficients, int n)
+double EvaluatePolynomial(const double x, __constant const double *coefficients, int n)
 {
 	double b1 = 0.0;
 	double b2 = 0.0;
@@ -130,25 +130,25 @@ double EvaluatePolynomial(const double x, const double *coefficients, int n)
 	return b1;
 }
 
-double Gamma(double v)
-{
-	double min_value = 0.770000000000000;
-	double max_value = 274.579999999999984;
 
-	const double p[5 + 1] = {
-		881.979476556478289, 1496.058452015812463, 908.662212739659481,
-		373.566100223287378, 85.840860336314364, 6.683258861509244,
-	};
-	const double q[5 + 1] = {
-		12.262350348616792, 20.557285797683576, 12.161463238367844,
-		4.711532733641639, 0.899112889751053, 0.035662329617191,
-	};
+__constant double g_gamma_p[5 + 1] = {  // Coefficients Gamma() passes to EvaluatePolynomial() for the numerator yp.
+	881.979476556478289, 1496.058452015812463, 908.662212739659481,
+	373.566100223287378, 85.840860336314364, 6.683258861509244,
+};
+__constant double g_gamma_q[5 + 1] = {  // Coefficients Gamma() passes to EvaluatePolynomial() for the denominator yq.
+	12.262350348616792, 20.557285797683576, 12.161463238367844,
+	4.711532733641639, 0.899112889751053, 0.035662329617191,
+};
 
+double Gamma(double v)
+{
+	const double min_value = 0.770000000000000;
+	const double max_value = 274.579999999999984;
 	const double x01 = (v - min_value) / (max_value - min_value);
 	const double xc = 2.0 * x01 - 1.0;
 
-	const double yp = EvaluatePolynomial(xc, p, 6);
-	const double yq = EvaluatePolynomial(xc, q, 6);
+	const double yp = EvaluatePolynomial(xc, g_gamma_p, 6);
+	const double yq = EvaluatePolynomial(xc, g_gamma_q, 6);
 	if (yq == 0.0) return 0.0;
 	return (float)(yp / yq);
 }
@@ -842,46 +842,6 @@ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize],
 	double avgdiff_xyb[3] = { 0.0 };
 	double avgdiff_edge[3][4] = { { 0.0 } };
 
-	const double csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = {
-		5.28270670524,
-		0.0,
-		0.0,
-		0.0,
-		0.3831134973,
-		0.676303603859,
-		3.58927792424,
-		18.6104367002,
-		18.6104367002,
-		3.09093131948,
-		1.0,
-		0.498250875965,
-		0.36198671102,
-		0.308982169883,
-		0.1312701920435,
-		2.37370549629,
-		3.58927792424,
-		1.0,
-		2.37370549629,
-		0.991205724152,
-		1.05178802919,
-		0.627264168628,
-		0.4,
-		0.1312701920435,
-		0.676303603859,
-		0.498250875965,
-		0.991205724152,
-		0.5,
-		0.3831134973,
-		0.349686450518,
-		0.627264168628,
-		0.308982169883,
-		0.3831134973,
-		0.36198671102,
-		1.05178802919,
-		0.3831134973,
-		0.12,
-	};
-
 	for (int i = 0; i < 3 * kBlockSize; ++i) {
 		const double diff_xyb = xyb0[i] - xyb1[i];
 		const int c = i / kBlockSize;
@@ -933,8 +893,6 @@ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize],
 	const double ymul2 = 1.51983458269;
 	const double zmul = 2.4;
 
-
-
 	for (size_t i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) {
 		double d = csf8x8[i];
 		diff_xyb_ac[0] += d * xmul * x_halfdiff_squared[i];
@@ -1034,13 +992,11 @@ __kernel void MaskHighIntensityChange(
 	};
 	double sqr_max_diff = -1;
 	{
-		int offset[4] =
-			{ -1, 1, -(int)(xsize), (int)(xsize) };
-		int border[4] =
-			{ x == 0, x + 1 == xsize, y == 0, y + 1 == ysize };
+		int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) };
+		int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize };
 		for (int dir = 0; dir < 4; ++dir) {
 			if (border[dir]) {
-			continue;
+				continue;
 			}
 			const int ix2 = ix + offset[dir];
 			double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1];
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index bc6e5bb1..e55453d5 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -615,7 +615,7 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	double offset = 0.454909521427;
 	double scaler = 0.0738288224836;
 	double mul = 20.8029176447;
-	double lut_x[512];
+	static double lut_x[512];
 	MakeMask(extmul, extoff, mul, offset, scaler, lut_x);
 
 	extmul = 0.373995618954;
@@ -623,7 +623,7 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	offset = 0.911952641929;
 	scaler = 1.1731667845;
 	mul = 16.2447033988;
-	double lut_y[512];
+	static double lut_y[512];
 	MakeMask(extmul, extoff, mul, offset, scaler, lut_y);
 
 	extmul = 0.61582234137;
@@ -631,7 +631,7 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	offset = 1.05105070921;
 	scaler = 0.47434643535;
 	mul = 31.1444967089;
-	double lut_b[512];
+	static double lut_b[512];
 	MakeMask(extmul, extoff, mul, offset, scaler, lut_b);
 
 	extmul = 1.79116943438;
@@ -639,7 +639,7 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	offset = 0.670960225853;
 	scaler = 0.486575865525;
 	mul = 20.4563479139;
-	double lut_dcx[512];
+	static double lut_dcx[512];
 	MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx);
 
 	extmul = 0.212223514236;
@@ -647,7 +647,7 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	offset = 1.73396799447;
 	scaler = 0.170392660501;
 	mul = 21.6566724788;
-	double lut_dcy[512];
+	static double lut_dcy[512];
 	MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy);
 
 	extmul = 0.349376011816;
@@ -655,7 +655,7 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	offset = 0.901647926679;
 	scaler = 0.380086095024;
 	mul = 18.0373825149;
-	double lut_dcb[512];
+	static double lut_dcb[512];
 	MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb);
 
 	size_t channel_size = 512 * 3 * sizeof(double);
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index d2cf62c7..3026fb04 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -137,7 +137,7 @@
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
       <AdditionalIncludeDirectories>.;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <Optimization>Disabled</Optimization>
+      <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>false</IntrinsicFunctions>
       <MinimalRebuild>false</MinimalRebuild>

From 95f10c7ce8f8956f59cd72ad7c4e0c8899da4900 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 8 May 2017 20:47:51 +0800
Subject: [PATCH 050/189] for test

---
 guetzli/butteraugli_comparator.cc |   6 ++
 guetzli/butteraugli_comparator.h  |   4 ++
 guetzli/guetzli.cc                |   5 ++
 guetzli/processor.cc              | 106 ++++++++++++++++++------------
 4 files changed, 78 insertions(+), 43 deletions(-)

diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc
index 1748b80d..9034d68e 100644
--- a/guetzli/butteraugli_comparator.cc
+++ b/guetzli/butteraugli_comparator.cc
@@ -22,6 +22,9 @@
 #include "guetzli/gamma_correct.h"
 #include "guetzli/score.h"
 
+int g_switchBlock = 0;
+int g_compareBlock = 0;
+
 namespace guetzli {
 
 ButteraugliComparator::ButteraugliComparator(const int width, const int height,
@@ -94,6 +97,8 @@ void ButteraugliComparator::SwitchBlock(int block_x, int block_y,
       ::butteraugli::OpsinDynamicsImage(8, 8, per_block_pregamma_[bx]);
     }
   }
+
+  g_switchBlock++;
 }
 
 double ButteraugliComparator::CompareBlock(const OutputImage& img,
@@ -109,6 +114,7 @@ double ButteraugliComparator::CompareBlock(const OutputImage& img,
   std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
   img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c);
   ::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c);
+  g_compareBlock++;
 
   std::vector<std::vector<float> > rgb0 = rgb0_c;
   std::vector<std::vector<float> > rgb1 = rgb1_c;
diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h
index 3879a599..098341e3 100644
--- a/guetzli/butteraugli_comparator.h
+++ b/guetzli/butteraugli_comparator.h
@@ -25,10 +25,14 @@
 #include "guetzli/output_image.h"
 #include "guetzli/stats.h"
 
+extern int g_switchBlock;
+extern int g_compareBlock;
+
 namespace guetzli {
 
 constexpr int kButteraugliStep = 3;
 
+
 class ButteraugliComparator : public Comparator {
  public:
   ButteraugliComparator(const int width, const int height,
diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc
index 3355265e..aa328cb1 100644
--- a/guetzli/guetzli.cc
+++ b/guetzli/guetzli.cc
@@ -233,6 +233,9 @@ void Usage() {
 
 }  // namespace
 
+extern int g_switchBlock;
+extern int g_compareBlock;
+
 int main(int argc, char** argv) {
   std::set_terminate(TerminateHandler);
 
@@ -330,5 +333,7 @@ int main(int argc, char** argv) {
   }
 
   WriteFileOrDie(argv[opt_idx + 1], out_data);
+
+  fprintf(stderr, "%d %d", g_switchBlock, g_compareBlock);
   return 0;
 }
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 134dfe17..b6057f5e 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -362,63 +362,82 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
 }
 
 
+void func(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
+	const uint8_t comp_mask, guetzli::Params &params_, std::vector<std::pair<int, float> > &input_order)
+{	// Appends an (index, score) pair to input_order for every non-zero coefficient of block, then sorts input_order.
+	static const uint8_t oldCsf[kDCTBlockSize] = {
+		10, 10, 20, 40, 60, 70, 80, 90,
+		10, 20, 30, 60, 70, 80, 90, 90,
+		20, 30, 60, 70, 80, 90, 90, 90,
+		40, 60, 70, 80, 90, 90, 90, 90,
+		60, 70, 80, 90, 90, 90, 90, 90,
+		70, 80, 90, 90, 90, 90, 90, 90,
+		80, 90, 90, 90, 90, 90, 90, 90,
+		90, 90, 90, 90, 90, 90, 90, 90,
+	};
+	static const double kWeight[3] = { 1.0, 0.22, 0.20 };
+#include "guetzli/order.inc"
+	// NOTE(review): csf[] and bias[] used below are not declared here; presumably they come from the include above - confirm.
+	for (int c = 0; c < 3; ++c) { // TOBEREMOVE: visit each component enabled in comp_mask and collect its non-zero coefficients
+		if (!(comp_mask & (1 << c))) continue;
+		for (int k = 1; k < kDCTBlockSize; ++k) {
+			int idx = c * kDCTBlockSize + k; // TOBEREMOVE: flat index of coefficient k of component c (k starts at 1, so k == 0 is never added)
+			if (block[idx] != 0) {
+				float score;
+				if (params_.new_zeroing_model) {
+					score = std::abs(orig_block[idx]) * csf[idx] + bias[idx];
+				}
+				else {
+					score = static_cast<float>((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) * kWeight[c] / oldCsf[k]);
+				}
+				input_order.push_back(std::make_pair(idx, score));
+			}
+		}
+	}
+	std::sort(input_order.begin(), input_order.end(), [](const std::pair<int, float>& a, const std::pair<int, float>& b) { return a.second < b.second; });
+	// input_order is left sorted by ascending score for the caller.
+}
+
 // REQUIRES: block[c*64...(c*64+63)] is all zero if (comp_mask & (1<<c)) == 0.
 void Processor::ComputeBlockZeroingOrder(
     const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
     const int block_x, const int block_y, const int factor_x,
     const int factor_y, const uint8_t comp_mask, OutputImage* img,
     std::vector<CoeffData>* output_order) {
-  static const uint8_t oldCsf[kDCTBlockSize] = {
-      10, 10, 20, 40, 60, 70, 80, 90,
-      10, 20, 30, 60, 70, 80, 90, 90,
-      20, 30, 60, 70, 80, 90, 90, 90,
-      40, 60, 70, 80, 90, 90, 90, 90,
-      60, 70, 80, 90, 90, 90, 90, 90,
-      70, 80, 90, 90, 90, 90, 90, 90,
-      80, 90, 90, 90, 90, 90, 90, 90,
-      90, 90, 90, 90, 90, 90, 90, 90,
-  };
-  static const double kWeight[3] = { 1.0, 0.22, 0.20 };
-#include "guetzli/order.inc"
-  std::vector<std::pair<int, float> > input_order;
-  for (int c = 0; c < 3; ++c) { // TOBEREMOVE:��������block��input_order,��0�Ĵ��
-    if (!(comp_mask & (1 << c))) continue;
-    for (int k = 1; k < kDCTBlockSize; ++k) {
-      int idx = c * kDCTBlockSize + k; // TOBEREMOVE:ÿ����������
-      if (block[idx] != 0) {
-        float score;
-        if (params_.new_zeroing_model) {
-          score = std::abs(orig_block[idx]) * csf[idx] + bias[idx];
-        } else {
-          score = static_cast<float>((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) *
-                  kWeight[c] / oldCsf[k]);
-        }
-        input_order.push_back(std::make_pair(idx, score));
-      }
+
+	std::vector<std::pair<int, float> > input_order;
+	func(block, orig_block, comp_mask, params_, input_order);
+    if (input_order.size() > 10)
+    {
+        int i = 0;
+        i++;
     }
-  }
-  std::sort(input_order.begin(), input_order.end(),
-            [](const std::pair<int, float>& a, const std::pair<int, float>& b) {
-              return a.second < b.second; });
-  coeff_t processed_block[kBlockSize];
-  memcpy(processed_block, block, sizeof(processed_block));
-  comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y);
+
+
+	coeff_t processed_block[kBlockSize];
+	memcpy(processed_block, block, sizeof(processed_block));
+
+	comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y);
+
+
   while (!input_order.empty()) {
     float best_err = 1e17f;
     int best_i = 0;
-    for (size_t i = 0; i < std::min<size_t>(params_.zeroing_greedy_lookahead,
-                                         input_order.size());
-         ++i) {
+    for (size_t i = 0; i < std::min<size_t>(params_.zeroing_greedy_lookahead, input_order.size()); ++i)
+    {
       coeff_t candidate_block[kBlockSize];
       memcpy(candidate_block, processed_block, sizeof(candidate_block));
+
       const int idx = input_order[i].first;
+
       candidate_block[idx] = 0; // TOBEREMOVE:�Ա�block������÷�ǰi�͵���0(i����input_order���ݱ仯���仯)���������ûضԱ�ͼ�������������Ӧblock��ȥ�����������ԱȲ��á�
+
       for (int c = 0; c < 3; ++c) {
         if (comp_mask & (1 << c)) {
-          img->component(c).SetCoeffBlock(
-              block_x, block_y, &candidate_block[c * kDCTBlockSize]);
+          img->component(c).SetCoeffBlock(block_x, block_y, &candidate_block[c * kDCTBlockSize]);
         }
       }
+
       float max_err = 0;
       for (int iy = 0; iy < factor_y; ++iy) {
         for (int ix = 0; ix < factor_x; ++ix) {
@@ -430,19 +449,21 @@ void Processor::ComputeBlockZeroingOrder(
           }
         }
       }
+
       if (max_err < best_err) { // TOBEREMOVE:�ҳ���С����ֵ��i
         best_err = max_err;
         best_i = i;
       }
     }
+
     int idx = input_order[best_i].first;
     processed_block[idx] = 0;
     input_order.erase(input_order.begin() + best_i);
+
     output_order->push_back({idx, best_err}); // TOBEREMOVE:����������������С�����idx����Ӧ���Ա�block�еĶ�Ӧλ����������Ϊ0,�Ƴ�input_order���ѡȡ��ǰֵ������output_order,����ʽ�����õ��Ա�ͼ����ȥ��
     for (int c = 0; c < 3; ++c) {
       if (comp_mask & (1 << c)) {
-        img->component(c).SetCoeffBlock(
-            block_x, block_y, &processed_block[c * kDCTBlockSize]);
+        img->component(c).SetCoeffBlock(block_x, block_y, &processed_block[c * kDCTBlockSize]);
       }
     }
   }
@@ -464,8 +485,7 @@ void Processor::ComputeBlockZeroingOrder(
   // Restore *img to the same state as it was at the start of this function.
   for (int c = 0; c < 3; ++c) {
     if (comp_mask & (1 << c)) {
-      img->component(c).SetCoeffBlock(
-          block_x, block_y, &block[c * kDCTBlockSize]);
+      img->component(c).SetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]);
     }
   }
 }

From 1a8fcc2f3e93e10051025c9dbc5c9e5e8696934c Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 9 May 2017 15:10:07 +0800
Subject: [PATCH 051/189] =?UTF-8?q?=E6=B5=8B=E8=AF=95=E5=8D=B7=E7=A7=AF?=
 =?UTF-8?q?=E5=87=BD=E6=95=B0=EF=BC=8C=E8=8A=82=E7=9C=81=E4=B8=80=E5=9D=97?=
 =?UTF-8?q?=E4=B8=AD=E9=97=B4=E7=BC=93=E5=AD=98=E7=9A=84=E4=BD=BF=E7=94=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl  |  93 ++++++++++++++++++++++++++
 clguetzli/clguetzli.cpp | 143 ++++++++++++++++++++++++++++++++++++++++
 clguetzli/ocl.h         |   3 +
 3 files changed, 239 insertions(+)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 564fff1e..4a6809be 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -33,6 +33,83 @@ __kernel void MinSquareVal(__global float* pA, __global float* pC, int square_si
 	pC[y * width + x] = minValue;
 }
 
+__kernel void ConvolutionX(__global float* multipliers, __global float* inp, __global float* result,
+	int step, int len, int offset, float border_ratio)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	// Horizontal pass: only work-items whose x is a multiple of step produce output; the others write nothing.
+	if (x % step != 0) return;
+	// The image extent is taken from the 2D global work size.
+	const int xsize = get_global_size(0);
+	const int ysize = get_global_size(1);  // not used by this kernel
+	// Sum of multipliers[0 .. 2 * offset], i.e. the weight when the window is not clipped.
+	float weight_no_border = 0;
+	for (int j = 0; j <= 2 * offset; j++)
+	{
+		weight_no_border += multipliers[j];
+	}
+	// Clip the tap window [x - offset, x + len - offset) to [0, xsize).
+	int minx = x < offset ? 0 : x - offset;
+	int maxx = min(xsize, x + len - offset);
+	// Sum of the taps that fall inside the clipped window.
+	float weight = 0.0;
+	for (int j = minx; j < maxx; j++)
+	{
+		weight += multipliers[j - x + offset];
+	}
+	// Blend clipped and unclipped weights by border_ratio; the reciprocal normalises the sum below.
+	weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
+	float scale = 1.0 / weight;
+	// Weighted sum of row y of inp over the clipped window.
+	float sum = 0.0;
+	for (int j = minx; j < maxx; j++)
+	{
+		sum += inp[y * xsize + j] * multipliers[j - x + offset];
+	}
+
+	result[y * xsize + x] = sum * scale;
+}
+
+__kernel void ConvolutionY(__global float* multipliers, __global float* inp, __global float* result,
+	int step, int len, int offset, float border_ratio)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	// Vertical pass: only work-items with both x and y a multiple of step produce output.
+	if (x % step != 0) return;
+	if (y % step != 0) return;
+	// The image extent is taken from the 2D global work size.
+	const int xsize = get_global_size(0);
+	const int ysize = get_global_size(1);
+	// Sum of multipliers[0 .. 2 * offset], i.e. the weight when the window is not clipped.
+	float weight_no_border = 0;
+	for (int j = 0; j <= 2 * offset; j++)
+	{
+		weight_no_border += multipliers[j];
+	}
+	// Clip the tap window [y - offset, y + len - offset) to [0, ysize).
+	int miny = y < offset ? 0 : y - offset;
+	int maxy = min(ysize, y + len - offset);
+	// Sum of the taps that fall inside the clipped window.
+	float weight = 0.0;
+	for (int j = miny; j < maxy; j++)
+	{
+		weight += multipliers[j - y + offset];
+	}
+	// Blend clipped and unclipped weights by border_ratio; the reciprocal normalises the sum below.
+	weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
+	float scale = 1.0 / weight;
+	// Weighted sum of column x of inp over the clipped window.
+	float sum = 0.0;
+	for (int j = miny; j < maxy; j++)
+	{
+		sum += inp[j * xsize + x] * multipliers[j - y + offset];
+	}
+
+	result[y * xsize + x] = sum * scale;
+}
+
 __kernel void Convolution(__global float* multipliers, __global float* inp, __global float* result,
 							int xsize, int xstep, int len, int offset, float border_ratio)
 {
@@ -71,6 +148,22 @@ __kernel void Convolution(__global float* multipliers, __global float* inp, __gl
 	result[ox * ysize + y] = sum * scale;
 }
 
+__kernel void SquareSample(__global float* pA, __global float* pC, int xstep, int ystep)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+	// Round (x, y) down to the nearest multiples of (xstep, ystep).
+	int x_sample = x - x % xstep;
+	int y_sample = y - y % ystep;
+	// Work-items that already sit on a sample point write nothing.
+	if (x_sample == x && y_sample == y) return;
+	// The image extent is taken from the 2D global work size.
+	const int xsize = get_global_size(0);
+	const int ysize = get_global_size(1);  // not used by this kernel
+	// Copy the value at the sample point of pA into this pixel of pC.
+	pC[y * xsize + x] = pA[y_sample * xsize + x_sample];
+}
+
 __kernel void DownSample(__global float* pA, __global float* pC, int xstep, int ystep)
 {
 	const int x = get_global_id(0);
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index e55453d5..6b462dd7 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -45,6 +45,9 @@ ocl_args_d_t& getOcl(void)
 	}
 	ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "MinSquareVal", &err);
 	ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "Convolution", &err);
+	ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "ConvolutionX", &err);
+	ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "ConvolutionY", &err);
+	ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "SquareSample", &err);
 	ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "DownSample", &err);
 	ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "OpsinDynamicsImage", &err);
 	ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "DoMask", &err);
@@ -102,6 +105,104 @@ void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize,
 	}
 }
 
+void clConvolutionX(cl_mem inp, size_t xsize, size_t ysize,
+	cl_mem multipliers, size_t len,
+	int xstep, int offset, double border_ratio,
+	cl_mem result/*out*/)
+{
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+	// Host wrapper for the ConvolutionX kernel: binds its arguments, runs it over xsize x ysize and waits for completion.
+	cl_int clxstep = xstep;
+	cl_int cllen = len;
+	cl_int cloffset = offset;
+	cl_float clborder_ratio = border_ratio;
+	// Scalars are passed through the cl_* typed copies above so each sizeof() matches the object it points at.
+	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONX];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&multipliers);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result);
+	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxstep);
+	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&cllen);
+	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cloffset);
+	clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio);
+	// One work-item per pixel; the kernel derives the image size from the global work size.
+	size_t globalWorkSize[2] = { xsize, ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clConvolutionX() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clConvolutionX() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
+}
+
+void clConvolutionY(cl_mem inp, size_t xsize, size_t ysize,
+	cl_mem multipliers, size_t len,
+	int xstep, int offset, double border_ratio,
+	cl_mem result/*out*/)
+{
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+	// Host wrapper for the ConvolutionY kernel: binds its arguments, runs it over xsize x ysize and waits for completion.
+	cl_int clxstep = xstep;  // the kernel applies this single step to both x and y
+	cl_int cllen = len;
+	cl_int cloffset = offset;
+	cl_float clborder_ratio = border_ratio;
+	// Scalars are passed through the cl_* typed copies above so each sizeof() matches the object it points at.
+	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONY];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&multipliers);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result);
+	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxstep);
+	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&cllen);
+	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cloffset);
+	clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio);
+	// One work-item per pixel; the kernel derives the image size from the global work size.
+	size_t globalWorkSize[2] = { xsize, ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clConvolutionY() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clConvolutionY() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
+}
+
+void clUpsampleEx2(cl_mem image, size_t xsize, size_t ysize,
+	size_t xstep, size_t ystep,
+	cl_mem result/*out*/)
+{
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
+	// Host wrapper for the SquareSample kernel: binds its arguments, runs it over xsize x ysize and waits for completion.
+	cl_int clxstep = xstep;
+	cl_int clystep = ystep;
+	cl_kernel kernel = ocl.kernel[KERNEL_SQUARESAMPLE];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&image);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&result);
+	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxstep);
+	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clystep);
+	// One work-item per pixel; the kernel derives the image size from the global work size.
+	size_t globalWorkSize[2] = { xsize, ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clUpsampleEx2() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+	}
+	err = clFinish(ocl.commandQueue);
+	if (CL_SUCCESS != err)
+	{
+		LogError("Error: clUpsampleEx2() clFinish returned %s.\n", TranslateOpenCLError(err));
+	}
+}
+
 void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize,
                   size_t xstep, size_t ystep,
                   cl_mem result/*out*/)
@@ -130,10 +231,52 @@ void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize,
 	}
 }
 
+void clBlurEx2(cl_mem image/*in, out*/, size_t xsize, size_t ysize,
+	double sigma, double border_ratio,
+	cl_mem result/*out, opt*/)
+{
+	double m = 2.25;  // Accuracy increases when m is increased.
+	const double scaler = -1.0 / (2 * sigma * sigma);
+	// For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52}
+	const int diff = std::max<int>(1, m * fabs(sigma));
+	const int expn_size = 2 * diff + 1;
+	std::vector<float> expn(expn_size);
+	for (int i = -diff; i <= diff; ++i) {
+		expn[i + diff] = static_cast<float>(exp(scaler * i * i));
+	}
+	// expn now holds exp(scaler * i * i) for i in [-diff, diff]; it is uploaded below as the kernel multipliers.
+	const int xstep = std::max<int>(1, int(sigma / 3));
+	// xstep is passed as the step of both convolution passes and of the fill-in pass.
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+	cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size);
+	// Non-blocking write; the clFinish() right after it waits for the queue before the kernels are launched.
+	clEnqueueWriteBuffer(ocl.commandQueue, mem_expn, CL_FALSE, 0, sizeof(cl_float) * expn_size, expn.data(), 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+	// Both branches run X then Y with ocl.srcA as the intermediate; the output goes to result, or to image when result is null.
+	if (xstep > 1)
+	{
+		ocl.allocA(sizeof(cl_float) * xsize * ysize);  // NOTE(review): presumably sizes the buffer exposed as ocl.srcA - confirm in ocl.h
+		clConvolutionX(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA);
+		clConvolutionY(ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, result ? result : image);
+		clUpsampleEx2(result ? result : image, xsize, ysize, xstep, xstep, result ? result : image);  // in-place fill of the pixels the strided passes skipped
+	}
+	else
+	{
+		ocl.allocA(sizeof(cl_float) * xsize * ysize);
+		clConvolutionX(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA);
+		clConvolutionY(ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, result ? result : image);
+	}
+	// The multiplier buffer is only needed for the calls above.
+	clReleaseMemObject(mem_expn);
+}
 void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
               double sigma, double border_ratio,
               cl_mem result/*out, opt*/)
 {
+	clBlurEx2(image, xsize, ysize, sigma, border_ratio, result);
+
+	return;
 	double m = 2.25;  // Accuracy increases when m is increased.
 	const double scaler = -1.0 / (2 * sigma * sigma);
 	// For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52}
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index a210d1c1..aac82f31 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -46,6 +46,9 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType);
 enum KernelName {
 	KERNEL_MINSQUAREVAL = 0,
 	KERNEL_CONVOLUTION,
+	KERNEL_CONVOLUTIONX,
+	KERNEL_CONVOLUTIONY,
+	KERNEL_SQUARESAMPLE,
 	KERNEL_DOWNSAMPLE,
 	KERNEL_OPSINDYNAMICSIMAGE,
 	KERNEL_DOMASK,

From e919c9b424edcba0a8816db0755652dcf22fdf62 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 9 May 2017 20:10:48 +0800
Subject: [PATCH 052/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

Conflicts:
	clguetzli/clguetzli_test.cpp
	third_party/butteraugli/butteraugli/butteraugli.cc
---
 clguetzli/clguetzli.cl                        | 131 ++++++++++--------
 clguetzli/clguetzli.cpp                       |   6 +-
 clguetzli/clguetzli.h                         |  12 +-
 clguetzli/clguetzli_test.cpp                  |  98 ++++++++++---
 clguetzli/clguetzli_test.h                    |  29 ++++
 .../butteraugli/butteraugli/butteraugli.cc    |  38 +++--
 6 files changed, 229 insertions(+), 85 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 4a6809be..31b87f9d 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -357,14 +357,14 @@ __kernel void CombineChannels(
 	int step,
 	__global float *result)
 {
-	const int res_x = get_global_id(0);
-	const int res_y = get_global_id(1);
+	const int res_x = get_global_id(0) * step;
+	const int res_y = get_global_id(1) * step;
 
-	const int res_xsize = get_global_size(0);
-	const int res_ysize = get_global_size(1);
+	const int res_xsize = (xsize + step - 1) / step;
+	const int res_ysize = (ysize + step - 1) / step;
 
-	if (res_x * step >= xsize - (8 - step)) return;
-	if (res_y * step >= ysize - (8 - step)) return;
+	//if (res_x * step >= xsize - (8 - step)) return;
+	//if (res_y * step >= ysize - (8 - step)) return;
 
 	double mask[3];
 	double dc_mask[3];
@@ -374,8 +374,8 @@ __kernel void CombineChannels(
 	mask[1] = mask_y[(res_y + 3) * xsize + (res_x + 3)];
 	dc_mask[1] = mask_dc_y[(res_y + 3) * xsize + (res_x + 3)];
 
-	mask[1] = mask_b[(res_y + 3) * xsize + (res_x + 3)];
-	dc_mask[1] = mask_dc_b[(res_y + 3) * xsize + (res_x + 3)];
+	mask[2] = mask_b[(res_y + 3) * xsize + (res_x + 3)];
+	dc_mask[2] = mask_dc_b[(res_y + 3) * xsize + (res_x + 3)];
 
 	size_t res_ix = (res_y * res_xsize + res_x) / step;
 	result[res_ix] = (float)(
@@ -463,6 +463,62 @@ void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0,
 	res[2] += factor * valz * valz;
 }
 
+void Butteraugli8x8CornerEdgeDetectorDiff(
+    int pos_x,
+    int pos_y,
+    int xsize,
+    int ysize,
+    __global float *r, __global float *g, __global float* b,
+    __global float *r2, __global float* g2, __global float *b2,
+    double* diff_xyb)
+{
+    int local_count = 0;
+    double local_xyb[3] = { 0 };
+    const double w = 0.711100840192;
+    // Adds a weighted edge-difference term for the 8x8 block at (pos_x, pos_y) to diff_xyb[0..2].
+    int offset[4][2] = { { 0,0 },{ 0,7 },{ 7,0 },{ 7,7 } };
+    int edgeSize = 3;
+    // offset[] lists the four corners of the block; each is probed horizontally and vertically.
+    for (int k = 0; k < 4; k++)
+    {
+        int x = pos_x + offset[k][0];
+        int y = pos_y + offset[k][1];
+        // Horizontal probe: pixels edgeSize to the left and right of the corner, when both are inside the row.
+        if (x >= edgeSize && x + edgeSize < xsize) {
+            size_t ix = y * xsize + (x - edgeSize);
+            size_t ix2 = ix + 2 * edgeSize;
+            XybDiffLowFreqSquaredAccumulate(
+                w * (r[ix] - r[ix2]),
+                w * (g[ix] - g[ix2]),
+                w * (b[ix] - b[ix2]),
+                w * (r2[ix] - r2[ix2]),
+                w * (g2[ix] - g2[ix2]),
+                w * (b2[ix] - b2[ix2]),
+                1.0, local_xyb);
+            ++local_count;
+        }
+        if (y >= edgeSize && y + edgeSize < ysize) {  // vertical probe: pixels edgeSize above and below the corner
+            size_t ix = (y - edgeSize) * xsize + x;
+            size_t ix2 = ix + 2 * edgeSize * xsize;
+            XybDiffLowFreqSquaredAccumulate(
+                w * (r[ix] - r[ix2]),
+                w * (g[ix] - g[ix2]),
+                w * (b[ix] - b[ix2]),
+                w * (r2[ix] - r2[ix2]),
+                w * (g2[ix] - g2[ix2]),
+                w * (b2[ix] - b2[ix2]),
+                1.0, local_xyb);
+            ++local_count;
+        }
+    }
+    // NOTE(review): local_count stays 0 when no probe fits inside the image, making mul a division by zero - confirm callers exclude that.
+    const double weight = 0.01617112696;
+    const double mul = weight * 8.0 / local_count;
+    for (int i = 0; i < 3; ++i) {
+        diff_xyb[i] += mul * local_xyb[i];
+    }
+}
+
 __kernel void edgeDetectorMap(__global float *result,
 						      __global float *r, __global float *g, __global float* b,
 						      __global float *r2, __global float* g2, __global float *b2,
@@ -483,53 +539,16 @@ __kernel void edgeDetectorMap(__global float *result,
 	pos_x = min(pos_x, xsize - 8);
 	pos_y = min(pos_y, ysize - 8);
 
-	int local_count = 0;
-	double local_xyb[3] = { 0 };
-	const double w = 0.711100840192;
-
-	int offset[4][2] = {{0,0}, {0,7}, {7,0}, {7,7}};
-	int edgeSize = 3;
-
-	for (int k = 0; k < 4; k++)
-	{
-		int x = pos_x + offset[k][0];
-		int y = pos_y + offset[k][1];
-
-		if (x >= edgeSize && x + edgeSize < xsize) {
-			size_t ix = y * xsize + (x - edgeSize);
-			size_t ix2 = ix + 2 * edgeSize;
-			XybDiffLowFreqSquaredAccumulate(
-				w * (r[ix] - r[ix2]),
-				w * (g[ix] - g[ix2]),
-				w * (b[ix] - b[ix2]),
-				w * (r2[ix] - r2[ix2]),
-				w * (g2[ix] - g2[ix2]),
-				w * (b2[ix] - b2[ix2]),
-				1.0, local_xyb);
-			++local_count;
-		}
-		if (y >= edgeSize && y + edgeSize < ysize) {
-			size_t ix = (y - edgeSize) * xsize + x;
-			size_t ix2 = ix + 2 * edgeSize * xsize;
-			XybDiffLowFreqSquaredAccumulate(
-				w * (r[ix] - r[ix2]),
-				w * (g[ix] - g[ix2]),
-				w * (b[ix] - b[ix2]),
-				w * (r2[ix] - r2[ix2]),
-				w * (g2[ix] - g2[ix2]),
-				w * (b2[ix] - b2[ix2]),
-				1.0, local_xyb);
-			++local_count;
-		}
-	}
-
-	const double weight = 0.01617112696;
-	const double mul = weight * 8.0 / local_count;
+    double diff_xyb[3] = { 0.0 };
+    Butteraugli8x8CornerEdgeDetectorDiff(pos_x, pos_y, xsize, ysize,
+        r, g, b,
+        r2, g2, b2,
+        &diff_xyb[0]);
 
 	int idx = (res_y * res_xsize + res_x) * 3;
-	result[idx]     = local_xyb[0];
-	result[idx + 1] = local_xyb[1];
-	result[idx + 2] = local_xyb[2];
+	result[idx]     = diff_xyb[0];
+	result[idx + 1] = diff_xyb[1];
+	result[idx + 2] = diff_xyb[2];
 }
 
 __kernel void edgeDetectorLowFreq(__global float *result,
@@ -590,9 +609,9 @@ __kernel void edgeDetectorLowFreq(__global float *result,
 
 	const double kMul = 10;
 
-	result[res_ix * 3] = max_diff_xyb[0] * kMul;
-	result[res_ix * 3 + 1] = max_diff_xyb[1] * kMul;
-	result[res_ix * 3 + 2] = max_diff_xyb[2] * kMul;
+	result[res_ix * 3] += max_diff_xyb[0] * kMul;
+	result[res_ix * 3 + 1] += max_diff_xyb[1] * kMul;
+	result[res_ix * 3 + 2] += max_diff_xyb[2] * kMul;
 }
 
 #define kBlockEdge 8
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 6b462dd7..fe014d70 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -13,7 +13,7 @@ ocl_args_d_t& getOcl(void)
 	if (bInit == true) return ocl;
 
 	bInit = true;
-	cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU);
+	cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_CPU);
 	if (CL_SUCCESS != err)
 	{
 		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
@@ -883,8 +883,8 @@ void clCombineChannelsEx(
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
-	const size_t res_xsize = (xsize + step - 1) / step;
-	const size_t res_ysize = (ysize + step - 1) / step;
+	const size_t res_xsize = ((xsize - 8 + step) + step - 1) / step;
+	const size_t res_ysize = ((ysize - 8 + step) + step - 1) / step;
 
 	cl_int clxsize = xsize;
 	cl_int clysize = ysize;
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index a714bf44..9d6ceba2 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -20,7 +20,7 @@ void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2,
 
 void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2,
 	size_t xsize, size_t ysize, size_t step,
-	cl_mem block_diff_ac/*out*/);
+	cl_mem block_diff_ac/*in,out*/);
 
 void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio, cl_mem result = nullptr);
 
@@ -31,3 +31,13 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	size_t xsize, size_t ysize,
 	size_t step,
 	float* result);
+
+void clCombineChannelsEx(
+	ocl_channels mask,
+	ocl_channels mask_dc,
+	cl_mem block_diff_dc,
+	cl_mem block_diff_ac,
+	cl_mem edge_detector_map,
+	size_t xsize, size_t ysize,
+	size_t step,
+	cl_mem result/*out*/);
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 1bb20681..c229806e 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -30,7 +30,6 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	const float* result_r, const float* result_g, const float* result_b,
 	const float* result_r2, const float* result_g2, const float* result_b2)
 {
-	return;
 	if (xsize < 100 || ysize < 100) return;
 
 	size_t channel_size = xsize * ysize * sizeof(float);
@@ -82,7 +81,6 @@ void clEdgeDetectorMap(const float* r, const float* g, const float* b,
 	size_t xsize, size_t ysize, size_t step,
 	const float* result)
 {
-	return;
 	if (xsize < 100 || ysize < 100) return;
 
 	size_t channel_size = xsize * ysize * sizeof(float);
@@ -125,7 +123,6 @@ void clBlockDiffMap(const float* r, const float* g, const float* b,
 	size_t xsize, size_t ysize, size_t step,
 	const float* result_diff_dc, const float* result_diff_ac)
 {
-	return;
 	if (xsize < 100 || ysize < 100) return;
 
 	size_t channel_size = xsize * ysize * sizeof(float);
@@ -173,9 +170,9 @@ void clBlockDiffMap(const float* r, const float* g, const float* b,
 void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize, size_t step,
-	const float* result_diff_dc)
+    const float* orign_ac,
+	const float* result_diff_ac)
 {
-	return;
 	if (xsize < 100 || ysize < 100) return;
 
 	size_t channel_size = xsize * ysize * sizeof(float);
@@ -188,7 +185,7 @@ void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
 	ocl_channels xyb0 = ocl.allocMemChannels(channel_size);
 	ocl_channels xyb1 = ocl.allocMemChannels(channel_size);
 
-	cl_mem block_diff_dc = ocl.allocMem(reschannel_size);
+	cl_mem block_diff_ac = ocl.allocMem(reschannel_size);
 
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
@@ -196,22 +193,24 @@ void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
+    clEnqueueWriteBuffer(ocl.commandQueue, block_diff_ac, CL_FALSE, 0, reschannel_size, orign_ac, 0, NULL, NULL);
+
 	err = clFinish(ocl.commandQueue);
 
-	clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc);
+	clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac);
 
-	cl_float *r_dc = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_dc, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err);
+	cl_float *r_ac = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_ac, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err);
 	err = clFinish(ocl.commandQueue);
 
-	FLOAT_COMPARE(r_dc, result_diff_dc, res_xsize * res_ysize * 3);
+	FLOAT_COMPARE(r_ac, result_diff_ac, res_xsize * res_ysize * 3);
 
-	clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_dc, r_dc, reschannel_size, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, reschannel_size, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
 	ocl.releaseMemChannels(xyb0);
 	ocl.releaseMemChannels(xyb1);
 
-	clReleaseMemObject(block_diff_dc);
+	clReleaseMemObject(block_diff_ac);
 }
 
 void clMask(const float* r, const float* g, const float* b,
@@ -272,21 +271,82 @@ void clMask(const float* r, const float* g, const float* b,
 }
 
 // ian todo
-void clCombineChannels(void)
+void clCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b,
+	const float *mask_xyb_dc_x, const float *mask_xyb_dc_y, const float *mask_xyb_dc_b,
+	const float *block_diff_dc,	const float *block_diff_ac,
+	const float *edge_detector_map,
+	size_t xsize, size_t ysize,
+	size_t res_xsize, size_t res_ysize,
+	size_t step,
+	float *result)
 {
+	cl_int err = CL_SUCCESS;	// harness: runs clCombineChannelsEx on host data and compares its output with `result`
+	ocl_args_d_t &ocl = getOcl();
 
+	size_t channel_size = xsize * ysize * sizeof(float);	// bytes per full-resolution mask plane
+	ocl_channels mask = ocl.allocMemChannels(channel_size);
+	ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
+	cl_mem cl_block_diff_dc = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
+	cl_mem cl_block_diff_ac = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
+	cl_mem cl_edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
+	cl_mem cl_result = ocl.allocMem(res_xsize * res_ysize * sizeof(float));
+	// Upload every input with non-blocking writes on ocl.commandQueue.
+	clEnqueueWriteBuffer(ocl.commandQueue, mask.x, CL_FALSE, 0, channel_size, mask_xyb_x, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, mask.y, CL_FALSE, 0, channel_size, mask_xyb_y, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, mask.b, CL_FALSE, 0, channel_size, mask_xyb_b, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, mask_dc.x, CL_FALSE, 0, channel_size, mask_xyb_dc_x, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, mask_dc.y, CL_FALSE, 0, channel_size, mask_xyb_dc_y, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, mask_dc.b, CL_FALSE, 0, channel_size, mask_xyb_dc_b, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, cl_block_diff_dc, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), block_diff_dc, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, cl_block_diff_ac, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), block_diff_ac, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, cl_edge_detector_map, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), edge_detector_map, 0, NULL, NULL);
+	// Run the OpenCL implementation; its output lands in cl_result.
+	clCombineChannelsEx(mask, mask_dc, cl_block_diff_dc, cl_block_diff_ac, cl_edge_detector_map, xsize, ysize, step, cl_result);
+	// Blocking map (third argument true) so result_tmp is readable on return.
+	cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_result, true, CL_MAP_READ, 0, res_xsize * res_ysize * sizeof(float), 0, NULL, NULL, &err);
+	// Compare the device output with the caller-supplied values, one float per result cell.
+	FLOAT_COMPARE(result_tmp, result, res_xsize * res_ysize);
+	clEnqueueUnmapMemObject(ocl.commandQueue, cl_result, result_tmp, 0, NULL, NULL);	// undo the map before the buffer is released
+	ocl.releaseMemChannels(mask);
+	ocl.releaseMemChannels(mask_dc);
+	clReleaseMemObject(cl_block_diff_dc);
+	clReleaseMemObject(cl_block_diff_ac);
+	clReleaseMemObject(cl_edge_detector_map);
+	clReleaseMemObject(cl_result);
 }
 
 // ian todo
-void clCalculateDiffmapEx(void)
+void clCalculateDiffmapEx(const size_t xsize, const size_t ysize,
+	const size_t step,
+	float *diffmap)
 {
 
 }
 
 // chrisk todo
-void clBlur(void)
+void clBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result)
 {
+    if (xsize < 100 || ysize < 100) return;
+
+    size_t channel_size = xsize * ysize * sizeof(float);  // bytes in one plane
+    cl_int err = 0;
+    ocl_args_d_t &ocl = getOcl();
+    cl_mem r = ocl.allocMem(channel_size);
+    // Harness: upload `channel`, blur it on the device in place, then compare with `result`.
+    clEnqueueWriteBuffer(ocl.commandQueue, r, CL_FALSE, 0, channel_size, channel, 0, NULL, NULL);
+    err = clFinish(ocl.commandQueue);
+    // The same buffer is passed as both input image and output.
+    clBlurEx(r, xsize, ysize, sigma, border_ratio, r);
 
+    cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+    err = clFinish(ocl.commandQueue);
+    // Compare xsize * ysize floats of `result` with the mapped device output.
+    FLOAT_COMPARE(result, r_r, xsize * ysize);
+    // The 4th argument is num_events_in_wait_list; it must be 0 when the wait list is NULL.
+    clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL);
+    err = clFinish(ocl.commandQueue);
+    // Release the device buffer.
+    clReleaseMemObject(r);
 }
 
 // chrisk todo
@@ -302,13 +362,17 @@ void clUpsample(void)
 }
 
 // ian todo
-void clDiffPrecompute(void)
+void clDiffPrecompute(
+	const float *xyb0_x, const float *xyb0_y, const float *xyb0_b,
+	const float *xyb1_x, const float *xyb1_y, const float *xyb1_b,
+	size_t xsize, size_t ysize,
+	float *mask_x, float *mask_y, float *mask_b)
 {
 
 }
 
 // ian todo
-void clAverage5x5(void)
+void clAverage5x5(int xsize, int ysize, float *diffs)
 {
 
 }
@@ -320,7 +384,7 @@ void clMinSquareVal(void)
 }
 
 // ian todo
-void clScaleImage(void)
+void clScaleImage(double scale, float *result)
 {
 
 }
diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h
index 3b62144e..71500eff 100644
--- a/clguetzli/clguetzli_test.h
+++ b/clguetzli/clguetzli_test.h
@@ -9,6 +9,8 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	const float* result_r, const float* result_g, const float* result_b,
 	const float* result_r2, const float* result_g2, const float* result_b2);
 
+void clBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result);
+
 void clEdgeDetectorMap(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize, size_t step,
@@ -22,6 +24,7 @@ void clBlockDiffMap(const float* r, const float* g, const float* b,
 void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize, size_t step,
+    const float* orign_ac,
 	const float* result_diff_dc);
 
 void clMask(const float* r, const float* g, const float* b,
@@ -29,3 +32,29 @@ void clMask(const float* r, const float* g, const float* b,
 	size_t xsize, size_t ysize,
 	const float* mask_r, const float* mask_g, const float* mask_b,
 	const float* maskdc_r, const float* maskdc_g, const float* maskdc_b);
+
+void clCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b,
+	const float *mask_xyb_dc_x, const float *mask_xyb_dc_y, const float *mask_xyb_dc_b,
+	const float *block_diff_dc, const float *block_diff_ac,
+	const float *edge_detector_map,
+	size_t xsize, size_t ysize,
+	size_t res_xsize, size_t res_ysize,
+	size_t step,
+	float *result);
+
+void clCalculateDiffmapEx(const size_t xsize, const size_t ysize,
+	const size_t step,
+	float *diffmap);
+
+void clBlur(size_t xsize, size_t ysize, float* channel, double sigma,
+	double border_ratio);
+
+void clDiffPrecompute(
+	const float *xyb0_x, const float *xyb0_y, const float *xyb0_b,
+	const float *xyb1_x, const float *xyb1_y, const float *xyb1_b,
+	size_t xsize, size_t ysize,
+	float *mask_x, float *mask_y, float *mask_b);
+
+void clAverage5x5(int xsize, int ysize, float *diffs);
+
+void clScaleImage(double scale, float *result);
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 138234ad..8869b518 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -98,6 +98,9 @@ static void Convolution(size_t xsize, size_t ysize,
 void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
           double border_ratio) {
 
+    std::vector<float> orignChannel(xsize * ysize);
+    memcpy(orignChannel.data(), channel, xsize * ysize * sizeof(float));
+
   PROFILER_FUNC;
   double m = 2.25;  // Accuracy increases when m is increased.
   const double scaler = -1.0 / (2 * sigma * sigma);
@@ -133,6 +136,8 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
       }
     }
   }
+
+  clBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel);
 }
 
 // To change this to n, add the relevant FFTn function and kFFTnMapIndexTable.
@@ -834,7 +839,7 @@ void MaskHighIntensityChange(
 	  c1[0].data(), c1[1].data(), c1[2].data(),
 	  xsize, ysize,
 	  xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-	  xyb0[0].data(), xyb1[1].data(), xyb1[2].data());
+	  xyb1[0].data(), xyb1[1].data(), xyb1[2].data());
 }
 
 double SimpleGamma(double v) {
@@ -1062,6 +1067,8 @@ static void ScaleImage(double scale, std::vector<float> *result) {
   for (size_t i = 0; i < result->size(); ++i) {
     (*result)[i] *= static_cast<float>(scale);
   }
+
+  clScaleImage(scale, (*result).data());
 }
 
 // Making a cluster of local errors to be more impactful than
@@ -1121,6 +1128,7 @@ void CalculateDiffmap(const size_t xsize, const size_t ysize,
     }
     ScaleImage(scale, diffmap);
   }
+  clCalculateDiffmapEx(xsize, ysize, step, (*diffmap).data());
 }
 
 void ButteraugliComparator::DiffmapOpsinDynamicsImage(
@@ -1238,15 +1246,18 @@ void ButteraugliComparator::EdgeDetectorMap(
   }
 
   clEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-	  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
-	  xsize_, ysize_, step_,
-	  (*edge_detector_map).data());
+	                xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+	                xsize_, ysize_, step_,
+	                (*edge_detector_map).data());
 }
 
 void ButteraugliComparator::EdgeDetectorLowFreq(
     const std::vector<std::vector<float> > &xyb0,
     const std::vector<std::vector<float> > &xyb1,
     std::vector<float>* block_diff_ac) {
+
+    std::vector<float> orign_ac = *block_diff_ac;
+
   PROFILER_FUNC;
   static const double kSigma = 14;
   static const double kMul = 10;
@@ -1299,9 +1310,9 @@ void ButteraugliComparator::EdgeDetectorLowFreq(
   }
 
   clEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-	  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
-	  xsize_, ysize_, step_,
-	  (*block_diff_ac).data());
+	                    xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+	                    xsize_, ysize_, step_,
+                        orign_ac.data(), (*block_diff_ac).data());
 }
 
 void ButteraugliComparator::CombineChannels(
@@ -1314,7 +1325,7 @@ void ButteraugliComparator::CombineChannels(
   PROFILER_FUNC;
   result->resize(res_xsize_ * res_ysize_);
   for (size_t res_y = 0; res_y + (8 - step_) < ysize_; res_y += step_) {
-    for (size_t res_x = 0; res_x + (8 - step_) < xsize_; res_x += step_) {
+    for (size_t res_x = 0, j = 0; res_x + (8 - step_) < xsize_; res_x += step_, j++) {
       size_t res_ix = (res_y * res_xsize_ + res_x) / step_;
       double mask[3];
       double dc_mask[3];
@@ -1328,6 +1339,10 @@ void ButteraugliComparator::CombineChannels(
            DotProduct(&edge_detector_map[3 * res_ix], mask));
     }
   }
+  clCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(),
+	  mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(),
+	  block_diff_dc.data(),
+	  block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, (*result).data());
 }
 
 double ButteraugliScoreFromDiffmap(const std::vector<float>& diffmap) {
@@ -1522,6 +1537,8 @@ void Average5x5(int xsize, int ysize, std::vector<float>* diffs) {
   }
   *diffs = result;
   ScaleImage(scale, diffs);
+
+  clAverage5x5(xsize, ysize, (*diffs).data());
 }
 
 void DiffPrecompute(
@@ -1577,6 +1594,11 @@ void DiffPrecompute(
       }
     }
   }
+  clDiffPrecompute(
+	  xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+	  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+	  xsize, ysize,
+	  ((*mask)[0]).data(), ((*mask)[1]).data(), ((*mask)[2]).data());
 }
 
 void Mask(const std::vector<std::vector<float> > &xyb0,

From f947da99d3e62e3745c00391c5db2da0012e8212 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 10 May 2017 02:09:14 +0800
Subject: [PATCH 053/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3blockDiffMap=E8=AE=A1?=
 =?UTF-8?q?=E7=AE=97?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 31b87f9d..358e931b 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1042,11 +1042,11 @@ __kernel void blockDiffMap(__global float* r, __global float* g, __global float*
 	int pos_x = res_x * step;
 	int pos_y = res_y * step;
 
-	if ((pos_x + kBlockEdge - step - 1) >= ysize) return;
-	if ((pos_y + kBlockEdge - step - 1) >= xsize) return;
+	if ((pos_x + kBlockEdge - step - 1) >= xsize) return;
+	if ((pos_y + kBlockEdge - step - 1) >= ysize) return;
 
 	size_t res_ix = res_y * res_xsize + res_x;
-	size_t offset = min(res_y * step, ysize - 8) * xsize + min(res_x * step, xsize - 8);
+	size_t offset = min(pos_y, ysize - 8) * xsize + min(pos_x, xsize - 8);
 
 	double block0[3 * kBlockEdge * kBlockEdge];
 	double block1[3 * kBlockEdge * kBlockEdge];

From 6ba5810988f6544fa24f01d606455e31db5036b3 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Wed, 10 May 2017 14:21:13 +0800
Subject: [PATCH 054/189] Merge remote-tracking branch 'origin/master'

---
 .gitignore                                    |   1 +
 clguetzli/clguetzli.cpp                       |  34 ++--
 clguetzli/clguetzli.h                         |   6 +
 clguetzli/clguetzli_test.cpp                  |  72 +++++++--
 clguetzli/clguetzli_test.h                    |  35 ++--
 guetzli/guetzli.cc                            |   3 +
 .../butteraugli/butteraugli/butteraugli.cc    | 149 ++++++++++++------
 7 files changed, 211 insertions(+), 89 deletions(-)

diff --git a/.gitignore b/.gitignore
index dd10da52..3d270281 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,4 @@ ipch/
 *.cachefile
 *.VC.db
 *.VC.VC.opendb
+guetzli.vcxproj.user
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index fe014d70..2fb1262d 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -4,6 +4,7 @@
 #include "clguetzli.h"
 
 extern bool g_useOpenCL = false;
+extern bool g_checkOpenCL = false;
 
 ocl_args_d_t& getOcl(void)
 {
@@ -74,7 +75,7 @@ void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize,
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
-	size_t oxsize = xsize / xstep;
+	size_t oxsize = (xsize + xstep - 1) / xstep;
 
 	cl_int clxsize = xsize;
 	cl_int clxstep = xstep;
@@ -318,10 +319,21 @@ void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
 	clReleaseMemObject(mem_expn);
 }
 
-void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred, size_t size)
+void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t xsize, size_t ysize)
 {
+	static const double kSigma = 1.1;
+
+	cl_int channel_size = xsize * ysize * sizeof(float);
+
+	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	cl_int clSize = size;
+	ocl_channels rgb_blurred = ocl.allocMemChannels(channel_size);
+
+	clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r);
+	clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g);
+	clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
+
+	cl_int clSize = xsize * ysize;
 	cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE];
 	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&rgb.r);
 	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&rgb.g);
@@ -331,8 +343,8 @@ void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred
 	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&rgb_blurred.b);
 	clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&clSize);
 
-	size_t globalWorkSize[1] = { size };
-	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	size_t globalWorkSize[1] = { xsize * ysize };
+	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
 		LogError("Error: clOpsinDynamicsImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
@@ -342,29 +354,24 @@ void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, ocl_channels rgb_blurred
 	{
 		LogError("Error: clOpsinDynamicsImageEx() clFinish returned %s.\n", TranslateOpenCLError(err));
 	}
+
+	ocl.releaseMemChannels(rgb_blurred);
 }
 
 void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b)
 {
-	static const double kSigma = 1.1;
-
 	cl_int channel_size = xsize * ysize * sizeof(float);
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
     ocl_channels rgb = ocl.allocMemChannels(channel_size);
-	ocl_channels rgb_blurred = ocl.allocMemChannels(channel_size);
 
 	clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
-	clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r);
-	clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g);
-	clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
-
-	clOpsinDynamicsImageEx(rgb, rgb_blurred, xsize * ysize);
+	clOpsinDynamicsImageEx(rgb, xsize, ysize);
 
 	cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
 	cl_float *result_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
@@ -382,7 +389,6 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float*
 	clFinish(ocl.commandQueue);
 
     ocl.releaseMemChannels(rgb);
-	ocl.releaseMemChannels(rgb_blurred);
 }
 
 void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/,
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 9d6ceba2..21ec7237 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -3,6 +3,7 @@
 #include "ocl.h"
 
 extern bool g_useOpenCL;
+extern bool g_checkOpenCL;
 
 void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/,
 	ocl_channels xyb1/*in,out*/,
@@ -41,3 +42,8 @@ void clCombineChannelsEx(
 	size_t xsize, size_t ysize,
 	size_t step,
 	cl_mem result/*out*/);
+
+void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize,
+	cl_mem multipliers, size_t len,
+	int xstep, int offset, double border_ratio,
+	cl_mem result/*out*/);
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index c229806e..bac52b60 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -24,7 +24,7 @@ int floatCompare(const float* a, const float* b, size_t size, const char* szFunc
 	return count;
 }
 
-void clMaskHighIntensityChange(const float* r, const float* g, const float* b,
+void tclMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize,
 	const float* result_r, const float* result_g, const float* result_b,
@@ -76,7 +76,7 @@ void clMaskHighIntensityChange(const float* r, const float* g, const float* b,
 }
 
 // strong to
-void clEdgeDetectorMap(const float* r, const float* g, const float* b,
+void tclEdgeDetectorMap(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize, size_t step,
 	const float* result)
@@ -118,7 +118,7 @@ void clEdgeDetectorMap(const float* r, const float* g, const float* b,
 }
 
 // strong todo
-void clBlockDiffMap(const float* r, const float* g, const float* b,
+void tclBlockDiffMap(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize, size_t step,
 	const float* result_diff_dc, const float* result_diff_ac)
@@ -167,7 +167,7 @@ void clBlockDiffMap(const float* r, const float* g, const float* b,
 }
 
 // strong to
-void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
+void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize, size_t step,
     const float* orign_ac,
@@ -213,13 +213,12 @@ void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
 	clReleaseMemObject(block_diff_ac);
 }
 
-void clMask(const float* r, const float* g, const float* b,
+void tclMask(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize,
 	const float* mask_r, const float* mask_g, const float* mask_b,
 	const float* maskdc_r, const float* maskdc_g, const float* maskdc_b)
 {
-	return;
 	if (xsize < 100 || ysize < 100) return;
 
 	size_t channel_size = xsize * ysize * sizeof(float);
@@ -271,7 +270,7 @@ void clMask(const float* r, const float* g, const float* b,
 }
 
 // ian todo
-void clCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b,
+void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b,
 	const float *mask_xyb_dc_x, const float *mask_xyb_dc_y, const float *mask_xyb_dc_b,
 	const float *block_diff_dc,	const float *block_diff_ac,
 	const float *edge_detector_map,
@@ -316,7 +315,7 @@ void clCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const f
 }
 
 // ian todo
-void clCalculateDiffmapEx(const size_t xsize, const size_t ysize,
+void tclCalculateDiffmap(const size_t xsize, const size_t ysize,
 	const size_t step,
 	float *diffmap)
 {
@@ -324,7 +323,7 @@ void clCalculateDiffmapEx(const size_t xsize, const size_t ysize,
 }
 
 // chrisk todo
-void clBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result)
+void tclBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result)
 {
     if (xsize < 100 || ysize < 100) return;
 
@@ -350,19 +349,55 @@ void clBlur(float* channel, size_t xsize, size_t ysize, double sigma, double bor
 }
 
 // chrisk todo
-void clConvolution(void)
+void tclConvolution(float* result, size_t xsize, size_t ysize,
+	size_t xstep,
+	size_t len, size_t offset,
+	const float* multipliers,
+	const float* inp,
+	float border_ratio,
+	float* orign_result)
 {
+	return;	// harness is switched off: everything below is unreachable until this line is removed
+	if (xsize < 100 || ysize < 100) return;
+	// Harness: runs clConvolutionEx on host data and compares its output with orign_result.
+	int dxsize = (xsize + xstep - 1) / xstep;	// output columns: xsize / xstep, rounded up
+	size_t result_size = dxsize * ysize * sizeof(float);
+	size_t inp_size = xsize * ysize * sizeof(float);
+	size_t multipliers_size = len * sizeof(float);
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+	cl_mem r = ocl.allocMem(result_size);
+	cl_mem i = ocl.allocMem(inp_size);
+	cl_mem m = ocl.allocMem(multipliers_size);	// byte size, not element count: multipliers_size bytes are written below
+	// Upload the initial result contents, the input plane and the multipliers, then wait for the copies.
+	clEnqueueWriteBuffer(ocl.commandQueue, r, CL_FALSE, 0, result_size, result, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, i, CL_FALSE, 0, inp_size, inp, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, m, CL_FALSE, 0, multipliers_size, multipliers, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+	// Run the OpenCL convolution; its output lands in r.
+	clConvolutionEx(i, xsize, ysize, m, len, xstep, offset, border_ratio, r);
 
+	cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err);
+	err = clFinish(ocl.commandQueue);
+	// Compare dxsize * ysize floats of orign_result with the mapped device output.
+	FLOAT_COMPARE(orign_result, r_r, dxsize * ysize);
+	// The 4th argument is num_events_in_wait_list; it must be 0 when the wait list is NULL.
+	clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+	// Release the device buffers.
+	clReleaseMemObject(r);
+	clReleaseMemObject(i);
+	clReleaseMemObject(m);
 }
 
 // chirsk todo
-void clUpsample(void)
+void tclUpsample(void)
 {
 
 }
 
 // ian todo
-void clDiffPrecompute(
+void tclDiffPrecompute(
 	const float *xyb0_x, const float *xyb0_y, const float *xyb0_b,
 	const float *xyb1_x, const float *xyb1_y, const float *xyb1_b,
 	size_t xsize, size_t ysize,
@@ -372,19 +407,26 @@ void clDiffPrecompute(
 }
 
 // ian todo
-void clAverage5x5(int xsize, int ysize, float *diffs)
+void tclAverage5x5(int xsize, int ysize, float *diffs)
 {
 
 }
 
 // chrisk todo
-void clMinSquareVal(void)
+void tclMinSquareVal(void)
 {
 
 }
 
 // ian todo
-void clScaleImage(double scale, float *result)
+void tclScaleImage(double scale, float *result)
 {
 
 }
+
+// strong todo
+void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ysize,
+	float* result_r, float* result_g, float* result_b)
+{
+	// Placeholder: the body is empty, so no OpenCL run or comparison happens and all arguments are unused.
+}
\ No newline at end of file
diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h
index 71500eff..f57c16e0 100644
--- a/clguetzli/clguetzli_test.h
+++ b/clguetzli/clguetzli_test.h
@@ -3,37 +3,37 @@
 
 ocl_args_d_t& getOcl(void);
 
-void clMaskHighIntensityChange(const float* r, const float* g, const float* b,
+void tclMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize,
 	const float* result_r, const float* result_g, const float* result_b,
 	const float* result_r2, const float* result_g2, const float* result_b2);
 
-void clBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result);
+void tclBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result);
 
-void clEdgeDetectorMap(const float* r, const float* g, const float* b,
+void tclEdgeDetectorMap(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize, size_t step,
 	const float* result);
 
-void clBlockDiffMap(const float* r, const float* g, const float* b,
+void tclBlockDiffMap(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize, size_t step,
 	const float* result_diff_dc, const float* result_diff_ac);
 
-void clEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
+void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize, size_t step,
     const float* orign_ac,
 	const float* result_diff_dc);
 
-void clMask(const float* r, const float* g, const float* b,
+void tclMask(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize,
 	const float* mask_r, const float* mask_g, const float* mask_b,
 	const float* maskdc_r, const float* maskdc_g, const float* maskdc_b);
 
-void clCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b,
+void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b,
 	const float *mask_xyb_dc_x, const float *mask_xyb_dc_y, const float *mask_xyb_dc_b,
 	const float *block_diff_dc, const float *block_diff_ac,
 	const float *edge_detector_map,
@@ -42,19 +42,30 @@ void clCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const f
 	size_t step,
 	float *result);
 
-void clCalculateDiffmapEx(const size_t xsize, const size_t ysize,
+void tclCalculateDiffmap(const size_t xsize, const size_t ysize,
 	const size_t step,
 	float *diffmap);
 
-void clBlur(size_t xsize, size_t ysize, float* channel, double sigma,
+void tclConvolution(float* result, size_t xsize, size_t ysize,
+	size_t xstep,
+	size_t len, size_t offset,
+	const float* multipliers,
+	const float* inp,
+	float border_ratio,
+	float* orign_result);
+
+void tclBlur(size_t xsize, size_t ysize, float* channel, double sigma,
 	double border_ratio);
 
-void clDiffPrecompute(
+void tclDiffPrecompute(
 	const float *xyb0_x, const float *xyb0_y, const float *xyb0_b,
 	const float *xyb1_x, const float *xyb1_y, const float *xyb1_b,
 	size_t xsize, size_t ysize,
 	float *mask_x, float *mask_y, float *mask_b);
 
-void clAverage5x5(int xsize, int ysize, float *diffs);
+void tclAverage5x5(int xsize, int ysize, float *diffs);
+
+void tclScaleImage(double scale, float *result);
 
-void clScaleImage(double scale, float *result);
+void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ysize,
+	float* result_r, float* result_g, float* result_b);
diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc
index aa328cb1..32103a74 100644
--- a/guetzli/guetzli.cc
+++ b/guetzli/guetzli.cc
@@ -265,6 +265,9 @@ int main(int argc, char** argv) {
 	else if (!strcmp(argv[opt_idx], "--opencl")) {
 		g_useOpenCL = true;
 	}
+	else if (!strcmp(argv[opt_idx], "--checkcl")) {
+		g_checkOpenCL = true;
+	}
 	else if (!strcmp(argv[opt_idx], "--")) {
       opt_idx++;
       break;
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 8869b518..cce14dad 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -69,6 +69,11 @@ static void Convolution(size_t xsize, size_t ysize,
 	const float* __restrict__ inp,
 	float border_ratio,
 	float* __restrict__ result) {
+
+	int dxsize = (xsize + xstep - 1) / xstep;
+	std::vector<float> newResult(dxsize * ysize);
+	memcpy(newResult.data(), result, dxsize * ysize * sizeof(float));
+
   PROFILER_FUNC;
   float weight_no_border = 0;
 
@@ -93,13 +98,19 @@ static void Convolution(size_t xsize, size_t ysize,
       result[ox * ysize + y] = static_cast<float>(sum * scale);
     }
   }
+
+  tclConvolution(newResult.data(), xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
 }
 
 void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
           double border_ratio) {
 
-    std::vector<float> orignChannel(xsize * ysize);
-    memcpy(orignChannel.data(), channel, xsize * ysize * sizeof(float));
+    std::vector<float> orignChannel;
+	if (g_checkOpenCL)
+	{
+		orignChannel.resize(xsize * ysize);
+		memcpy(orignChannel.data(), channel, xsize * ysize * sizeof(float));
+	}
 
   PROFILER_FUNC;
   double m = 2.25;  // Accuracy increases when m is increased.
@@ -137,7 +148,10 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
     }
   }
 
-  clBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel);
+  if (g_checkOpenCL)
+  {
+	  tclBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel);
+  }
 }
 
 // To change this to n, add the relevant FFTn function and kFFTnMapIndexTable.
@@ -835,11 +849,14 @@ void MaskHighIntensityChange(
     }
   }
 
-  clMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(),
-	  c1[0].data(), c1[1].data(), c1[2].data(),
-	  xsize, ysize,
-	  xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-	  xyb1[0].data(), xyb1[1].data(), xyb1[2].data());
+  if (g_checkOpenCL)
+  {
+	  tclMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(),
+		  c1[0].data(), c1[1].data(), c1[2].data(),
+		  xsize, ysize,
+		  xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+		  xyb1[0].data(), xyb1[1].data(), xyb1[2].data());
+  }
 }
 
 double SimpleGamma(double v) {
@@ -1021,6 +1038,12 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize,
         return;
     }
 
+	std::vector< std::vector<float>> orig_rgb;
+	if (g_checkOpenCL)
+	{
+		orig_rgb = rgb;
+	}
+
   PROFILER_FUNC;
   std::vector<std::vector<float> > blurred = rgb;
   static const double kSigma = 1.1;
@@ -1037,16 +1060,6 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize,
       sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];
       sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1];
       sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2];
-
-#ifdef ENABLE_OPENCL_CHECK
-	  double sensitivity_new[3];
-	  sensitivity_new[0] = GammaNonRecursion(pre_mixed[0]) / pre_mixed[0];
-	  assert(fabs(sensitivity[0] - sensitivity_new[0]) < 0.01);
-	  sensitivity_new[1] = GammaNonRecursion(pre_mixed[1]) / pre_mixed[1];
-	  assert(fabs(sensitivity[1] - sensitivity_new[1]) < 0.01);
-	  sensitivity_new[2] = GammaNonRecursion(pre_mixed[2]) / pre_mixed[2];
-	  assert(fabs(sensitivity[2] - sensitivity_new[2]) < 0.01);
-#endif // ENABLE_OPENCL_CHECK
     }
     double cur_rgb[3] = { rgb[0][i],  rgb[1][i],  rgb[2][i] };
     double cur_mixed[3];
@@ -1060,6 +1073,12 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize,
     rgb[1][i] = static_cast<float>(y);
     rgb[2][i] = static_cast<float>(z);
   }
+
+  if (g_checkOpenCL)
+  {
+	  tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), xsize, ysize,
+		  rgb[0].data(), rgb[1].data(), rgb[2].data());
+  }
 }
 
 static void ScaleImage(double scale, std::vector<float> *result) {
@@ -1068,7 +1087,10 @@ static void ScaleImage(double scale, std::vector<float> *result) {
     (*result)[i] *= static_cast<float>(scale);
   }
 
-  clScaleImage(scale, (*result).data());
+  if (g_checkOpenCL)
+  {
+	  tclScaleImage(scale, (*result).data());
+  }
 }
 
 // Making a cluster of local errors to be more impactful than
@@ -1128,7 +1150,10 @@ void CalculateDiffmap(const size_t xsize, const size_t ysize,
     }
     ScaleImage(scale, diffmap);
   }
-  clCalculateDiffmapEx(xsize, ysize, step, (*diffmap).data());
+  if (g_checkOpenCL)
+  {
+	  tclCalculateDiffmap(xsize, ysize, step, (*diffmap).data());
+  }
 }
 
 void ButteraugliComparator::DiffmapOpsinDynamicsImage(
@@ -1208,10 +1233,14 @@ void ButteraugliComparator::BlockDiffMap(
       }
     }
   }
-  clBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-	  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
-	  xsize_, ysize_, step_,
-	  (*block_diff_dc).data(), (*block_diff_ac).data());
+
+  if (g_checkOpenCL)
+  {
+	  tclBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+		  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+		  xsize_, ysize_, step_,
+		  (*block_diff_dc).data(), (*block_diff_ac).data());
+  }
 }
 
 void ButteraugliComparator::EdgeDetectorMap(
@@ -1245,10 +1274,13 @@ void ButteraugliComparator::EdgeDetectorMap(
     }
   }
 
-  clEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-	                xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
-	                xsize_, ysize_, step_,
-	                (*edge_detector_map).data());
+  if (g_checkOpenCL)
+  {
+	  tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+		  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+		  xsize_, ysize_, step_,
+		  (*edge_detector_map).data());
+  }
 }
 
 void ButteraugliComparator::EdgeDetectorLowFreq(
@@ -1256,7 +1288,11 @@ void ButteraugliComparator::EdgeDetectorLowFreq(
     const std::vector<std::vector<float> > &xyb1,
     std::vector<float>* block_diff_ac) {
 
-    std::vector<float> orign_ac = *block_diff_ac;
+	std::vector<float> orign_ac;
+	if (g_checkOpenCL)
+	{
+		orign_ac = *block_diff_ac;
+	}
 
   PROFILER_FUNC;
   static const double kSigma = 14;
@@ -1309,10 +1345,13 @@ void ButteraugliComparator::EdgeDetectorLowFreq(
     }
   }
 
-  clEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-	                    xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
-	                    xsize_, ysize_, step_,
-                        orign_ac.data(), (*block_diff_ac).data());
+  if (g_checkOpenCL)
+  {
+	  tclEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+		  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+		  xsize_, ysize_, step_,
+		  orign_ac.data(), (*block_diff_ac).data());
+  }
 }
 
 void ButteraugliComparator::CombineChannels(
@@ -1339,10 +1378,14 @@ void ButteraugliComparator::CombineChannels(
            DotProduct(&edge_detector_map[3 * res_ix], mask));
     }
   }
-  clCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(),
-	  mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(),
-	  block_diff_dc.data(),
-	  block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, (*result).data());
+
+  if (g_checkOpenCL)
+  {
+	  tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(),
+		  mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(),
+		  block_diff_dc.data(),
+		  block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, (*result).data());
+  }
 }
 
 double ButteraugliScoreFromDiffmap(const std::vector<float>& diffmap) {
@@ -1538,7 +1581,10 @@ void Average5x5(int xsize, int ysize, std::vector<float>* diffs) {
   *diffs = result;
   ScaleImage(scale, diffs);
 
-  clAverage5x5(xsize, ysize, (*diffs).data());
+  if (g_checkOpenCL)
+  {
+	  tclAverage5x5(xsize, ysize, (*diffs).data());
+  }
 }
 
 void DiffPrecompute(
@@ -1594,11 +1640,15 @@ void DiffPrecompute(
       }
     }
   }
-  clDiffPrecompute(
-	  xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-	  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
-	  xsize, ysize,
-	  ((*mask)[0]).data(), ((*mask)[1]).data(), ((*mask)[2]).data());
+
+  if (g_checkOpenCL)
+  {
+	  tclDiffPrecompute(
+		  xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+		  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+		  xsize, ysize,
+		  ((*mask)[0]).data(), ((*mask)[1]).data(), ((*mask)[2]).data());
+  }
 }
 
 void Mask(const std::vector<std::vector<float> > &xyb0,
@@ -1653,11 +1703,14 @@ void Mask(const std::vector<std::vector<float> > &xyb0,
     ScaleImage(kGlobalScale * kGlobalScale, &(*mask_dc)[i]);
   }
 
-  clMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-	  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
-	  xsize, ysize,
-	  (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(),
-	  (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data());
+  if (g_checkOpenCL)
+  {
+	  tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+		  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+		  xsize, ysize,
+		  (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(),
+		  (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data());
+  }
 }
 
 }  // namespace butteraugli

From 853222f43a805e99a936158ef69d690fe6e5ac41 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Wed, 10 May 2017 14:54:56 +0800
Subject: [PATCH 055/189] add clMinSquareVal test

---
 clguetzli/clguetzli.h                         |  2 ++
 clguetzli/clguetzli_test.cpp                  | 25 +++++++++++++++--
 clguetzli/clguetzli_test.h                    |  4 +++
 .../butteraugli/butteraugli/butteraugli.cc    | 27 ++++++++++++++++---
 4 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 21ec7237..3c05aeb9 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -47,3 +47,5 @@ void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize,
 	cl_mem multipliers, size_t len,
 	int xstep, int offset, double border_ratio,
 	cl_mem result/*out*/);
+
+void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset);
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index bac52b60..25156a59 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -357,7 +357,6 @@ void tclConvolution(float* result, size_t xsize, size_t ysize,
 	float border_ratio,
 	float* orign_result)
 {
-	return;
 	if (xsize < 100 || ysize < 100) return;
 
 	int dxsize = (xsize + xstep - 1) / xstep;
@@ -413,9 +412,37 @@ void tclAverage5x5(int xsize, int ysize, float *diffs)
 }
 
 // chrisk todo
-void tclMinSquareVal(void)
+// Checks clMinSquareValEx against the CPU MinSquareVal result.
+// img:    input image (xsize * ysize floats) as it was before the CPU pass.
+// values: CPU output for the same square_size/offset; compared element by
+//         element with the device output through FLOAT_COMPARE.
+void tclMinSquareVal(float *img, size_t square_size, size_t offset,
+	size_t xsize, size_t ysize,
+	float *values)
 {
+	if (xsize < 100 || ysize < 100) return;
+
+	size_t img_size = xsize * ysize * sizeof(float);
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+	cl_mem r = ocl.allocMem(img_size);
+
+	clEnqueueWriteBuffer(ocl.commandQueue, r, CL_FALSE, 0, img_size, img, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
 
+	clMinSquareValEx(r, xsize, ysize, square_size, offset);
+
+	cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, img_size, 0, NULL, NULL, &err);
+	err = clFinish(ocl.commandQueue);
+
+	FLOAT_COMPARE(values, r_r, xsize * ysize);
+
+	// The 4th argument is num_events_in_wait_list; it must be 0 when the
+	// wait list is NULL, otherwise the unmap is rejected.
+	clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+
+	clReleaseMemObject(r);
 }
 
 // ian todo
diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h
index f57c16e0..4e94c490 100644
--- a/clguetzli/clguetzli_test.h
+++ b/clguetzli/clguetzli_test.h
@@ -69,3 +69,7 @@ void tclScaleImage(double scale, float *result);
 
 void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ysize,
 	float* result_r, float* result_g, float* result_b);
+
+void tclMinSquareVal(float *img, size_t square_size, size_t offset,
+	size_t xsize, size_t ysize,
+	float *values);
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index cce14dad..9168db05 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -70,9 +70,13 @@ static void Convolution(size_t xsize, size_t ysize,
 	float border_ratio,
 	float* __restrict__ result) {
 
-	int dxsize = (xsize + xstep - 1) / xstep;
-	std::vector<float> newResult(dxsize * ysize);
-	memcpy(newResult.data(), result, dxsize * ysize * sizeof(float));
+	std::vector<float> newResult;
+	if (g_checkOpenCL)
+	{
+		int dxsize = (xsize + xstep - 1) / xstep;
+		newResult.resize(dxsize * ysize);
+		memcpy(newResult.data(), result, dxsize * ysize * sizeof(float));
+	}
 
   PROFILER_FUNC;
   float weight_no_border = 0;
@@ -99,7 +103,10 @@ static void Convolution(size_t xsize, size_t ysize,
     }
   }
 
-  tclConvolution(newResult.data(), xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
+  if (g_checkOpenCL)
+  {
+	  tclConvolution(newResult.data(), xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
+  }
 }
 
 void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
@@ -1490,6 +1497,13 @@ void MinSquareVal(size_t square_size, size_t offset,
   assert(offset < square_size);
   std::vector<float> tmp(xsize * ysize);
 
+  std::vector<float> img;
+  if (g_checkOpenCL)
+  {
+	  img.resize(xsize * ysize);
+	  memcpy(img.data(), values, xsize * ysize * sizeof(float));
+  }
+
   for (size_t y = 0; y < ysize; ++y) {
     const size_t minh = offset > y ? 0 : y - offset;
     const size_t maxh = std::min<size_t>(ysize, y + square_size - offset);
@@ -1526,6 +1540,11 @@ void MinSquareVal(size_t square_size, size_t offset,
         *pValuePoint = min; pValuePoint += xsize;
     }
   }
+
+  if (g_checkOpenCL)
+  {
+	  tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values);
+  }
 }
 
 // ===== Functions used by Mask only =====

From 920de33da90a17a0e6fb9e0487b675758a7fabc1 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 11 May 2017 01:05:03 +0800
Subject: [PATCH 056/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

---
 clguetzli/clguetzli.cl                        |  8 +--
 clguetzli/clguetzli.cpp                       | 29 ++++++-----
 clguetzli/clguetzli.h                         |  6 ++-
 clguetzli/clguetzli_test.cpp                  | 50 ++++++++++++-------
 clguetzli/clguetzli_test.h                    |  1 +
 .../butteraugli/butteraugli/butteraugli.cc    | 12 ++++-
 6 files changed, 66 insertions(+), 40 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 358e931b..b2f58c66 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -354,18 +354,13 @@ __kernel void CombineChannels(
 	__global float *block_diff_ac,
 	__global float *edge_detector_map,
 	int xsize, int ysize,
+	int res_xsize,
 	int step,
 	__global float *result)
 {
 	const int res_x = get_global_id(0) * step;
 	const int res_y = get_global_id(1) * step;
 
-	const int res_xsize = (xsize + step - 1) / step;
-	const int res_ysize = (ysize + step - 1) / step;
-
-	//if (res_x * step >= xsize - (8 - step)) return;
-	//if (res_y * step >= ysize - (8 - step)) return;
-
 	double mask[3];
 	double dc_mask[3];
 	mask[0] = mask_x[(res_y + 3) * xsize + (res_x + 3)];
@@ -382,6 +377,7 @@ __kernel void CombineChannels(
 		DotProduct(&block_diff_dc[3 * res_ix], dc_mask) +
 		DotProduct(&block_diff_ac[3 * res_ix], mask) +
 		DotProduct(&edge_detector_map[3 * res_ix], mask));
+	//result[res_ix] = 1;
 }
 
 inline double Interpolate(__constant double *array, int size, double sx) {
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 2fb1262d..542c7858 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -14,7 +14,7 @@ ocl_args_d_t& getOcl(void)
 	if (bInit == true) return ocl;
 
 	bInit = true;
-	cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_CPU);
+	cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU);
 	if (CL_SUCCESS != err)
 	{
 		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
@@ -883,15 +883,17 @@ void clCombineChannelsEx(
 	cl_mem block_diff_ac,
 	cl_mem edge_detector_map,
 	size_t xsize, size_t ysize,
+	size_t res_xsize,
 	size_t step,
 	cl_mem result/*out*/)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
-	const size_t res_xsize = ((xsize - 8 + step) + step - 1) / step;
-	const size_t res_ysize = ((ysize - 8 + step) + step - 1) / step;
+	const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step;
+	const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step;
 
+	cl_int clres_size = res_xsize;
 	cl_int clxsize = xsize;
 	cl_int clysize = ysize;
 	cl_int clstep = step;
@@ -908,10 +910,11 @@ void clCombineChannelsEx(
 	clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&edge_detector_map);
 	clSetKernelArg(kernel, 9, sizeof(cl_int), (void*)&clxsize);
 	clSetKernelArg(kernel, 10, sizeof(cl_int), (void*)&clysize);
-	clSetKernelArg(kernel, 11, sizeof(cl_int), (void*)&clstep);
-	clSetKernelArg(kernel, 12, sizeof(cl_mem), (void*)&result);
+	clSetKernelArg(kernel, 11, sizeof(cl_int), (void*)&clres_size);
+	clSetKernelArg(kernel, 12, sizeof(cl_int), (void*)&clstep);
+	clSetKernelArg(kernel, 13, sizeof(cl_mem), (void*)&result);
 
-	size_t globalWorkSize[2] = { res_xsize, res_ysize};
+	size_t globalWorkSize[2] = { work_xsize, work_ysize };
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
@@ -1039,8 +1042,9 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize,
 void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 								 float* r2, float* g2, float* b2,
 								 size_t xsize, size_t ysize,
+								 size_t res_xsize, size_t res_ysize,
 								 size_t step,
-								 float* result)
+								 float* result, size_t result_len)
 {
 
 	cl_int channel_size = xsize * ysize * sizeof(float);
@@ -1059,13 +1063,14 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 
 	err = clFinish(ocl.commandQueue);
 
-	cl_mem mem_result = ocl.allocMem(channel_size);
+	cl_mem mem_result = ocl.allocMem(res_xsize * res_ysize * sizeof(float));
+	const float pattern = 0;
+	clEnqueueFillBuffer(ocl.commandQueue, mem_result, &pattern, sizeof(float), 0, res_xsize * res_ysize * sizeof(float), 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, mem_result, CL_FALSE, 0, result_len, result, 0, NULL, NULL);
+
 	ocl_channels mask = ocl.allocMemChannels(channel_size);
 	ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
 
-	const size_t res_xsize = (xsize + step - 1) / step;
-	const size_t res_ysize = (ysize + step - 1) / step;
-
 	cl_mem edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
 	cl_mem block_diff_dc = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
 	cl_mem block_diff_ac = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
@@ -1078,7 +1083,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 
 	clMaskEx(xyb0, xyb1, xsize, ysize, mask, mask_dc);
 
-	clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, step, mem_result);
+	clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, res_xsize, step, mem_result);
 
     clCalculateDiffmapEx(mem_result, xsize, ysize, step);
 
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 3c05aeb9..9fe49b8f 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -25,13 +25,16 @@ void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2,
 
 void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio, cl_mem result = nullptr);
 
+void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t xsize, size_t ysize);
+
 void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b);
 
 void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	float* r2, float* g2, float* b2,
 	size_t xsize, size_t ysize,
+	size_t res_xsize, size_t res_ysize,
 	size_t step,
-	float* result);
+	float* result, size_t result_len);
 
 void clCombineChannelsEx(
 	ocl_channels mask,
@@ -40,6 +43,7 @@ void clCombineChannelsEx(
 	cl_mem block_diff_ac,
 	cl_mem edge_detector_map,
 	size_t xsize, size_t ysize,
+	size_t res_xsize,
 	size_t step,
 	cl_mem result/*out*/);
 
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 25156a59..9b96120a 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -30,8 +30,6 @@ void tclMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	const float* result_r, const float* result_g, const float* result_b,
 	const float* result_r2, const float* result_g2, const float* result_b2)
 {
-	if (xsize < 100 || ysize < 100) return;
-
 	size_t channel_size = xsize * ysize * sizeof(float);
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
@@ -81,8 +79,6 @@ void tclEdgeDetectorMap(const float* r, const float* g, const float* b,
 	size_t xsize, size_t ysize, size_t step,
 	const float* result)
 {
-	if (xsize < 100 || ysize < 100) return;
-
 	size_t channel_size = xsize * ysize * sizeof(float);
 	const size_t res_xsize = (xsize + step - 1) / step;
 	const size_t res_ysize = (ysize + step - 1) / step;
@@ -123,8 +119,6 @@ void tclBlockDiffMap(const float* r, const float* g, const float* b,
 	size_t xsize, size_t ysize, size_t step,
 	const float* result_diff_dc, const float* result_diff_ac)
 {
-	if (xsize < 100 || ysize < 100) return;
-
 	size_t channel_size = xsize * ysize * sizeof(float);
 	const size_t res_xsize = (xsize + step - 1) / step;
 	const size_t res_ysize = (ysize + step - 1) / step;
@@ -173,8 +167,6 @@ void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
     const float* orign_ac,
 	const float* result_diff_ac)
 {
-	if (xsize < 100 || ysize < 100) return;
-
 	size_t channel_size = xsize * ysize * sizeof(float);
 	const size_t res_xsize = (xsize + step - 1) / step;
 	const size_t res_ysize = (ysize + step - 1) / step;
@@ -219,8 +211,6 @@ void tclMask(const float* r, const float* g, const float* b,
 	const float* mask_r, const float* mask_g, const float* mask_b,
 	const float* maskdc_r, const float* maskdc_g, const float* maskdc_b)
 {
-	if (xsize < 100 || ysize < 100) return;
-
 	size_t channel_size = xsize * ysize * sizeof(float);
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
@@ -269,7 +259,6 @@ void tclMask(const float* r, const float* g, const float* b,
 	ocl.releaseMemChannels(mask_dc);
 }
 
-// ian todo
 void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const float *mask_xyb_b,
 	const float *mask_xyb_dc_x, const float *mask_xyb_dc_y, const float *mask_xyb_dc_b,
 	const float *block_diff_dc,	const float *block_diff_ac,
@@ -277,6 +266,7 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const
 	size_t xsize, size_t ysize,
 	size_t res_xsize, size_t res_ysize,
 	size_t step,
+	float *init_result,
 	float *result)
 {
 	cl_int err = CL_SUCCESS;
@@ -299,8 +289,10 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const
 	clEnqueueWriteBuffer(ocl.commandQueue, cl_block_diff_dc, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), block_diff_dc, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, cl_block_diff_ac, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), block_diff_ac, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, cl_edge_detector_map, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), edge_detector_map, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, cl_result, CL_FALSE, 0, res_xsize * res_ysize * sizeof(float), init_result, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
 
-	clCombineChannelsEx(mask, mask_dc, cl_block_diff_dc, cl_block_diff_ac, cl_edge_detector_map, xsize, ysize, step, cl_result);
+	clCombineChannelsEx(mask, mask_dc, cl_block_diff_dc, cl_block_diff_ac, cl_edge_detector_map, xsize, ysize, res_xsize, step, cl_result);
 
 	cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_result, true, CL_MAP_READ, 0, res_xsize * res_ysize * sizeof(float), 0, NULL, NULL, &err);
 
@@ -325,8 +317,6 @@ void tclCalculateDiffmap(const size_t xsize, const size_t ysize,
 // chrisk todo
 void tclBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result)
 {
-    if (xsize < 100 || ysize < 100) return;
-
     size_t channel_size = xsize * ysize * sizeof(float);
     cl_int err = 0;
     ocl_args_d_t &ocl = getOcl();
@@ -357,8 +347,6 @@ void tclConvolution(float* result, size_t xsize, size_t ysize,
 	float border_ratio,
 	float* orign_result)
 {
-	if (xsize < 100 || ysize < 100) return;
-
 	int dxsize = (xsize + xstep - 1) / xstep;
 	size_t result_size = dxsize * ysize * sizeof(float);
 	size_t inp_size = xsize * ysize * sizeof(float);
@@ -367,7 +355,7 @@ void tclConvolution(float* result, size_t xsize, size_t ysize,
 	ocl_args_d_t &ocl = getOcl();
 	cl_mem r = ocl.allocMem(result_size);
 	cl_mem i = ocl.allocMem(inp_size);
-	cl_mem m = ocl.allocMem(len);
+	cl_mem m = ocl.allocMem(multipliers_size);
 
 	clEnqueueWriteBuffer(ocl.commandQueue, r, CL_FALSE, 0, result_size, result, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, i, CL_FALSE, 0, inp_size, inp, 0, NULL, NULL);
@@ -416,8 +404,6 @@ void tclMinSquareVal(float *img, size_t square_size, size_t offset,
 	size_t xsize, size_t ysize,
 	float *values)
 {
-	if (xsize < 100 || ysize < 100) return;
-
 	size_t img_size = xsize * ysize * sizeof(float);
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
@@ -449,5 +435,37 @@ void tclScaleImage(double scale, float *result)
 void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ysize,
 	float* result_r, float* result_g, float* result_b)
 {
+	// Checks clOpsinDynamicsImageEx against the CPU OpsinDynamicsImage:
+	// r/g/b are the channels before the CPU pass, result_* the CPU output.
+	// The kernel works in place on the uploaded channels, which are then
+	// mapped back and compared with FLOAT_COMPARE.
+	size_t channel_size = xsize * ysize * sizeof(float);
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+	ocl_channels rgb = ocl.allocMemChannels(channel_size);
+
+	clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
+	clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+
+	clOpsinDynamicsImageEx(rgb, xsize, ysize);
 
+	cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *r_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *r_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	err = clFinish(ocl.commandQueue);
+
+	FLOAT_COMPARE(result_r, r_r, xsize * ysize);
+	FLOAT_COMPARE(result_g, r_g, xsize * ysize);
+	FLOAT_COMPARE(result_b, r_b, xsize * ysize);
+
+	// The 4th argument is num_events_in_wait_list; it must be 0 when the
+	// wait list is NULL, otherwise the unmap is rejected.
+	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, r_r, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, r_g, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, r_b, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+
+	ocl.releaseMemChannels(rgb);
 }
\ No newline at end of file
diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h
index 4e94c490..12d9d057 100644
--- a/clguetzli/clguetzli_test.h
+++ b/clguetzli/clguetzli_test.h
@@ -40,6 +40,7 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const
 	size_t xsize, size_t ysize,
 	size_t res_xsize, size_t res_ysize,
 	size_t step,
+	float *init_result,
 	float *result);
 
 void tclCalculateDiffmap(const size_t xsize, const size_t ysize,
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 9168db05..ae2b8030 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -1172,7 +1172,7 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage(
 	{
 		result.resize(xsize_ * ysize_);
 		clDiffmapOpsinDynamicsImage(xyb0_arg[0].data(), xyb0_arg[1].data(), xyb0_arg[2].data(),
-			xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, result.data());
+			xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, res_xsize_, res_ysize_, result.data(), result.size());
 		return;
 	}
 
@@ -1368,8 +1368,16 @@ void ButteraugliComparator::CombineChannels(
     const std::vector<float>& block_diff_ac,
     const std::vector<float>& edge_detector_map,
     std::vector<float>* result) {
+
   PROFILER_FUNC;
   result->resize(res_xsize_ * res_ysize_);
+
+  std::vector<float> temp;
+  if (g_checkOpenCL)
+  {
+	  temp = *result;
+  }
+
   for (size_t res_y = 0; res_y + (8 - step_) < ysize_; res_y += step_) {
     for (size_t res_x = 0, j = 0; res_x + (8 - step_) < xsize_; res_x += step_, j++) {
       size_t res_ix = (res_y * res_xsize_ + res_x) / step_;
@@ -1391,7 +1399,7 @@ void ButteraugliComparator::CombineChannels(
 	  tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(),
 		  mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(),
 		  block_diff_dc.data(),
-		  block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, (*result).data());
+		  block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, &temp[0], &(*result)[0]);
   }
 }
 

From 6ce71751c779613281c22140d112749ff431aad1 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 11 May 2017 01:19:20 +0800
Subject: [PATCH 057/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3OpsinDynamicsImage?=
 =?UTF-8?q?=E8=BF=90=E7=AE=97=E7=BB=93=E6=9E=9C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl                             | 4 +++-
 third_party/butteraugli/butteraugli/butteraugli.cc | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index b2f58c66..2f5478b8 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -266,17 +266,19 @@ __kernel void OpsinDynamicsImage(
 	double pre[3] = { r_blurred[i], g_blurred[i],  b_blurred[i] };
 	double pre_mixed[3];
 	OpsinAbsorbance(pre, pre_mixed);
+
 	double sensitivity[3];
 	sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];
 	sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1];
 	sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2];
 
-	double cur_rgb[3] = { r_blurred[i], g_blurred[i],  b_blurred[i] };
+	double cur_rgb[3] = { r[i], g[i],  b[i] };
 	double cur_mixed[3];
     OpsinAbsorbance(cur_rgb, cur_mixed);
     cur_mixed[0] *= sensitivity[0];
     cur_mixed[1] *= sensitivity[1];
     cur_mixed[2] *= sensitivity[2];
+
     double x, y, z;
 	RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z);
     r[i] = x;
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index ae2b8030..aa8f9e75 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -1064,7 +1064,7 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize,
       double pre_rgb[3] = { blurred[0][i], blurred[1][i], blurred[2][i] };
       double pre_mixed[3];
       OpsinAbsorbance(pre_rgb, pre_mixed);
-      sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];
+      sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];����
       sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1];
       sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2];
     }

From 44df7121056b8167b25fbf6833a7df1f100846e0 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Thu, 11 May 2017 09:44:00 +0800
Subject: [PATCH 058/189] remove redundant parameter

---
 clguetzli/clguetzli_test.cpp                       | 11 +++++------
 clguetzli/clguetzli_test.h                         |  4 ++--
 third_party/butteraugli/butteraugli/butteraugli.cc | 13 ++-----------
 3 files changed, 9 insertions(+), 19 deletions(-)

diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 9b96120a..6679d0db 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -339,13 +339,13 @@ void tclBlur(float* channel, size_t xsize, size_t ysize, double sigma, double bo
 }
 
 // chrisk todo
-void tclConvolution(float* result, size_t xsize, size_t ysize,
+void tclConvolution(size_t xsize, size_t ysize,
 	size_t xstep,
 	size_t len, size_t offset,
 	const float* multipliers,
 	const float* inp,
 	float border_ratio,
-	float* orign_result)
+	float* result)
 {
 	int dxsize = (xsize + xstep - 1) / xstep;
 	size_t result_size = dxsize * ysize * sizeof(float);
@@ -353,11 +353,11 @@ void tclConvolution(float* result, size_t xsize, size_t ysize,
 	size_t multipliers_size = len * sizeof(float);
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	cl_mem r = ocl.allocMem(result_size);
+	ocl.allocA(result_size);
+	cl_mem r = ocl.srcA;
 	cl_mem i = ocl.allocMem(inp_size);
 	cl_mem m = ocl.allocMem(multipliers_size);
 
-	clEnqueueWriteBuffer(ocl.commandQueue, r, CL_FALSE, 0, result_size, result, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, i, CL_FALSE, 0, inp_size, inp, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, m, CL_FALSE, 0, multipliers_size, multipliers, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
@@ -367,12 +367,11 @@ void tclConvolution(float* result, size_t xsize, size_t ysize,
 	cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err);
 	err = clFinish(ocl.commandQueue);
 
-	FLOAT_COMPARE(orign_result, r_r, dxsize * ysize);
+	FLOAT_COMPARE(result, r_r, dxsize * ysize);
 
 	clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, result_size, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
-	clReleaseMemObject(r);
 	clReleaseMemObject(i);
 	clReleaseMemObject(m);
 }
diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h
index 12d9d057..1ce0466c 100644
--- a/clguetzli/clguetzli_test.h
+++ b/clguetzli/clguetzli_test.h
@@ -47,13 +47,13 @@ void tclCalculateDiffmap(const size_t xsize, const size_t ysize,
 	const size_t step,
 	float *diffmap);
 
-void tclConvolution(float* result, size_t xsize, size_t ysize,
+void tclConvolution(size_t xsize, size_t ysize,
 	size_t xstep,
 	size_t len, size_t offset,
 	const float* multipliers,
 	const float* inp,
 	float border_ratio,
-	float* orign_result);
+	float* result);
 
 void tclBlur(size_t xsize, size_t ysize, float* channel, double sigma,
 	double border_ratio);
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index aa8f9e75..948fea2e 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -69,15 +69,6 @@ static void Convolution(size_t xsize, size_t ysize,
 	const float* __restrict__ inp,
 	float border_ratio,
 	float* __restrict__ result) {
-
-	std::vector<float> newResult;
-	if (g_checkOpenCL)
-	{
-		int dxsize = (xsize + xstep - 1) / xstep;
-		newResult.resize(dxsize * ysize);
-		memcpy(newResult.data(), result, dxsize * ysize * sizeof(float));
-	}
-
   PROFILER_FUNC;
   float weight_no_border = 0;
 
@@ -105,7 +96,7 @@ static void Convolution(size_t xsize, size_t ysize,
 
   if (g_checkOpenCL)
   {
-	  tclConvolution(newResult.data(), xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
+	  tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
   }
 }
 
@@ -1064,7 +1055,7 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize,
       double pre_rgb[3] = { blurred[0][i], blurred[1][i], blurred[2][i] };
       double pre_mixed[3];
       OpsinAbsorbance(pre_rgb, pre_mixed);
-      sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];����
+      sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];
       sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1];
       sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2];
     }

From 79cb8cd9446ee2fd4af865ae7d2c182bebadca5e Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Thu, 11 May 2017 17:04:19 +0800
Subject: [PATCH 059/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

---
 clguetzli/clguetzli.cl                        | 80 +++++++++------
 clguetzli/clguetzli.cpp                       | 24 ++---
 clguetzli/clguetzli.h                         | 10 ++
 clguetzli/clguetzli_test.cpp                  | 97 +++++++++++++++++--
 clguetzli/clguetzli_test.h                    | 15 ++-
 .../butteraugli/butteraugli/butteraugli.cc    | 24 +++--
 6 files changed, 192 insertions(+), 58 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 2f5478b8..b4e3cdc8 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -172,7 +172,7 @@ __kernel void DownSample(__global float* pA, __global float* pC, int xstep, int
 	const int xsize = get_global_size(0);
 	const int ysize = get_global_size(1);
 
-	const int oxsize = xsize / xstep;
+	const int oxsize = (xsize + xstep - 1) / xstep;
 
 	const int sample_x = x / xstep;
 	const int sample_y = y / ystep;
@@ -1140,30 +1140,56 @@ __kernel void MaskHighIntensityChange(
 }
 
 
-#define XybToVals_off 11.38708334481672
-#define XybToVals_inc 14.550189611520716
-__constant double XybToVals_lut[21] = {
+#define XybToVals_off_x 11.38708334481672
+#define XybToVals_inc_x 14.550189611520716
+__constant double XybToVals_lut_x[21] = {
 	0,
-	XybToVals_off,
-	XybToVals_off + 1 * XybToVals_inc,
-	XybToVals_off + 2 * XybToVals_inc,
-	XybToVals_off + 3 * XybToVals_inc,
-	XybToVals_off + 4 * XybToVals_inc,
-	XybToVals_off + 5 * XybToVals_inc,
-	XybToVals_off + 6 * XybToVals_inc,
-	XybToVals_off + 7 * XybToVals_inc,
-	XybToVals_off + 8 * XybToVals_inc,
-	XybToVals_off + 9 * XybToVals_inc,
-	XybToVals_off + 10 * XybToVals_inc,
-	XybToVals_off + 11 * XybToVals_inc,
-	XybToVals_off + 12 * XybToVals_inc,
-	XybToVals_off + 13 * XybToVals_inc,
-	XybToVals_off + 14 * XybToVals_inc,
-	XybToVals_off + 15 * XybToVals_inc,
-	XybToVals_off + 16 * XybToVals_inc,
-	XybToVals_off + 17 * XybToVals_inc,
-	XybToVals_off + 18 * XybToVals_inc,
-	XybToVals_off + 19 * XybToVals_inc,
+	XybToVals_off_x,
+	XybToVals_off_x + 1 * XybToVals_inc_x,
+	XybToVals_off_x + 2 * XybToVals_inc_x,
+	XybToVals_off_x + 3 * XybToVals_inc_x,
+	XybToVals_off_x + 4 * XybToVals_inc_x,
+	XybToVals_off_x + 5 * XybToVals_inc_x,
+	XybToVals_off_x + 6 * XybToVals_inc_x,
+	XybToVals_off_x + 7 * XybToVals_inc_x,
+	XybToVals_off_x + 8 * XybToVals_inc_x,
+	XybToVals_off_x + 9 * XybToVals_inc_x,
+	XybToVals_off_x + 10 * XybToVals_inc_x,
+	XybToVals_off_x + 11 * XybToVals_inc_x,
+	XybToVals_off_x + 12 * XybToVals_inc_x,
+	XybToVals_off_x + 13 * XybToVals_inc_x,
+	XybToVals_off_x + 14 * XybToVals_inc_x,
+	XybToVals_off_x + 15 * XybToVals_inc_x,
+	XybToVals_off_x + 16 * XybToVals_inc_x,
+	XybToVals_off_x + 17 * XybToVals_inc_x,
+	XybToVals_off_x + 18 * XybToVals_inc_x,
+	XybToVals_off_x + 19 * XybToVals_inc_x,
+};
+
+#define XybToVals_off_y 1.4103373714040413
+#define XybToVals_inc_y 0.7084088867024
+__constant double XybToVals_lut_y[21] = {
+	0,
+	XybToVals_off_y,
+	XybToVals_off_y + 1 * XybToVals_inc_y,
+	XybToVals_off_y + 2 * XybToVals_inc_y,
+	XybToVals_off_y + 3 * XybToVals_inc_y,
+	XybToVals_off_y + 4 * XybToVals_inc_y,
+	XybToVals_off_y + 5 * XybToVals_inc_y,
+	XybToVals_off_y + 6 * XybToVals_inc_y,
+	XybToVals_off_y + 7 * XybToVals_inc_y,
+	XybToVals_off_y + 8 * XybToVals_inc_y,
+	XybToVals_off_y + 9 * XybToVals_inc_y,
+	XybToVals_off_y + 10 * XybToVals_inc_y,
+	XybToVals_off_y + 11 * XybToVals_inc_y,
+	XybToVals_off_y + 12 * XybToVals_inc_y,
+	XybToVals_off_y + 13 * XybToVals_inc_y,
+	XybToVals_off_y + 14 * XybToVals_inc_y,
+	XybToVals_off_y + 15 * XybToVals_inc_y,
+	XybToVals_off_y + 16 * XybToVals_inc_y,
+	XybToVals_off_y + 17 * XybToVals_inc_y,
+	XybToVals_off_y + 18 * XybToVals_inc_y,
+	XybToVals_off_y + 19 * XybToVals_inc_y,
 };
 
 void XybToVals(
@@ -1174,8 +1200,8 @@ void XybToVals(
     const double ymul = 2.28148649801;
 	const double zmul = 1.87816926918;
 
-	*valx = Interpolate(&XybToVals_lut[0], 21, x * xmul);
-	*valy = Interpolate(&XybToVals_lut[0], 21, y * ymul);
+	*valx = Interpolate(&XybToVals_lut_x[0], 21, x * xmul);
+	*valy = Interpolate(&XybToVals_lut_y[0], 21, y * ymul);
 	*valz = zmul * z;
 }
 
@@ -1195,7 +1221,7 @@ __kernel void DiffPrecompute(
 	double valsv1[3] = { 0.0 };
 	int ix2;
 
-	size_t ix = x + xsize * y;
+	int ix = x + xsize * y;
 	if (x + 1 < xsize) {
 		ix2 = ix + 1;
 	}
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 542c7858..b37eabcd 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -594,14 +594,14 @@ void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.r);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0.g);
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.x);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0.y);
 	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0.b);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb1.r);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb1.g);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb1.x);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb1.y);
 	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb1.b);
-	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&mask.r);
-	clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&mask.g);
+	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&mask.x);
+	clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&mask.y);
 	clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&mask.b);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
@@ -935,14 +935,15 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step
 	cl_int clxsize = xsize;
 	cl_int clysize = ysize;
 	cl_int clstep = step;
-	ocl.allocC(xsize * ysize * sizeof(float));
+
+  cl_mem mem_diffmap = ocl.allocMem(xsize * ysize * sizeof(float));
 
 	cl_kernel kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT];
 	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&diffmap);
 	clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&xsize);
 	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&ysize);
 	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&step);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&ocl.dstMem);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mem_diffmap);
 
 	const size_t res_xsize = (xsize + step - 1) / step;
 	const size_t res_ysize = (ysize + step - 1) / step;
@@ -954,7 +955,7 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step
 		LogError("Error: clUpsampleSquareRootEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
 	}
 	err = clFinish(ocl.commandQueue);
-	err = clEnqueueCopyBuffer(ocl.commandQueue, ocl.dstMem, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL);
+	err = clEnqueueCopyBuffer(ocl.commandQueue, mem_diffmap, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
 		LogError("Error: clUpsampleSquareRootEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err));
@@ -964,6 +965,7 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step
 	{
 		LogError("Error: clUpsampleSquareRootEx() clFinish returned %s.\n", TranslateOpenCLError(err));
 	}
+  clReleaseMemObject(mem_diffmap);
 }
 
 void clCalculateDiffmapGetBlurredEx(cl_mem diffmap, size_t xsize, size_t ysize, int s, int s2, cl_mem blurred)
@@ -1029,14 +1031,14 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize,
 	int s2 = (8 - step) / 2;
 
 	ocl_args_d_t &ocl = getOcl();
-	ocl.allocA((xsize - s) * (ysize - s) * sizeof(float));
-	cl_mem blurred = ocl.srcA;
+  cl_mem blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float));
 	clCalculateDiffmapGetBlurredEx(diffmap, (xsize - s), (ysize - s), s, s2, blurred);
 
 	static const double border_ratio = 0.03027655136;
 	clBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio);
 	clGetDiffmapFromBlurredEx(diffmap, (xsize - s), (ysize - s), s, s2, blurred);
 	clScaleImageEx(diffmap, xsize * ysize, scale);
+  clReleaseMemObject(blurred);
 }
 
 void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 9fe49b8f..13111404 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -53,3 +53,13 @@ void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize,
 	cl_mem result/*out*/);
 
 void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset);
+
+void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize,
+	size_t xstep, size_t ystep,
+	cl_mem result/*out*/);
+
+void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, int step);
+
+void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w);
+
+void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size_t ysize, ocl_channels mask/*out*/);
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 6679d0db..8965a633 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -1,6 +1,7 @@
 #include <CL/cl.h>
 #include <math.h>
 #include <assert.h>
+#include <vector>
 #include "clguetzli_test.h"
 #include "clguetzli.h"
 #include "ocl.h"
@@ -298,6 +299,7 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const
 
 	FLOAT_COMPARE(result_tmp, result, res_xsize * res_ysize);
 
+  clEnqueueUnmapMemObject(ocl.commandQueue, cl_result, result_tmp, res_xsize * res_ysize * sizeof(float), NULL, NULL);
 	ocl.releaseMemChannels(mask);
 	ocl.releaseMemChannels(mask_dc);
 	clReleaseMemObject(cl_block_diff_dc);
@@ -309,9 +311,21 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const
 // ian todo
 void tclCalculateDiffmap(const size_t xsize, const size_t ysize,
 	const size_t step,
-	float *diffmap)
+	float *diffmap, size_t org_len,
+	float *diffmap_cmp)
 {
+	cl_int err = CL_SUCCESS;
+	ocl_args_d_t &ocl = getOcl();
 
+	size_t length = xsize * ysize * sizeof(float);
+	cl_mem mem_diffmap = ocl.allocMem(length);
+	clEnqueueWriteBuffer(ocl.commandQueue, mem_diffmap, CL_FALSE, 0, org_len * sizeof(float), diffmap, 0, NULL, NULL);
+	clCalculateDiffmapEx(mem_diffmap, xsize, ysize, step);
+	//cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diffmap, true, CL_MAP_READ, 0, length, 0, NULL, NULL, &err);
+  //err = clFinish(ocl.commandQueue);
+	//FLOAT_COMPARE(result_tmp, diffmap_cmp, xsize * ysize);
+  //clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, length, NULL, NULL);
+	clReleaseMemObject(mem_diffmap);
 }
 
 // chrisk todo
@@ -377,19 +391,75 @@ void tclConvolution(size_t xsize, size_t ysize,
 }
 
 // chirsk todo
-void tclUpsample(void)
+void tclUpsample(float* image, size_t xsize, size_t ysize,
+	size_t xstep, size_t ystep,
+	float* result)
 {
+	int dxsize = (xsize + xstep - 1) / xstep;
+	int dysize = (ysize + ystep - 1) / ystep;
+	size_t img_size = dxsize * dysize * sizeof(float);
+	size_t result_size = xsize * ysize * sizeof(float);
+	cl_int err = 0;
+	ocl_args_d_t &ocl = getOcl();
+	cl_mem img = ocl.allocMem(img_size);
+	ocl.allocA(result_size);
+	cl_mem r = ocl.srcA;
+
+	clEnqueueWriteBuffer(ocl.commandQueue, img, CL_FALSE, 0, img_size, image, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+
+	clUpsampleEx(img, xsize, ysize, xstep, ystep, r);
 
+	cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err);
+	err = clFinish(ocl.commandQueue);
+
+	FLOAT_COMPARE(result, r_r, xsize * ysize);
+
+	clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, result_size, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+
+	clReleaseMemObject(img);
 }
 
 // ian todo
 void tclDiffPrecompute(
-	const float *xyb0_x, const float *xyb0_y, const float *xyb0_b,
-	const float *xyb1_x, const float *xyb1_y, const float *xyb1_b,
+  const std::vector<std::vector<float> > &xyb0,
+  const std::vector<std::vector<float> > &xyb1,
 	size_t xsize, size_t ysize,
-	float *mask_x, float *mask_y, float *mask_b)
+  std::vector<std::vector<float> > *mask_cmp)
 {
-
+  cl_int err = 0;
+  ocl_args_d_t &ocl = getOcl();
+  size_t channel_size = xsize * ysize * sizeof(float);
+  ocl_channels cl_xyb0 = ocl.allocMemChannels(channel_size);
+  ocl_channels cl_xyb1 = ocl.allocMemChannels(channel_size);
+  ocl_channels cl_mask = ocl.allocMemChannels(channel_size);
+
+  clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb0.x, CL_FALSE, 0, channel_size, xyb0[0].data(), 0, NULL, NULL);
+  clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb0.y, CL_FALSE, 0, channel_size, xyb0[1].data(), 0, NULL, NULL);
+  clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb0.b, CL_FALSE, 0, channel_size, xyb0[2].data(), 0, NULL, NULL);
+  clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb1.x, CL_FALSE, 0, channel_size, xyb1[0].data(), 0, NULL, NULL);
+  clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb1.y, CL_FALSE, 0, channel_size, xyb1[1].data(), 0, NULL, NULL);
+  clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb1.b, CL_FALSE, 0, channel_size, xyb1[2].data(), 0, NULL, NULL);
+
+
+  clDiffPrecomputeEx(cl_xyb0, cl_xyb1, xsize, ysize, cl_mask);
+
+  cl_float *r_x = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.x, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+  cl_float *r_y = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.y, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+  cl_float *r_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+  err = clFinish(ocl.commandQueue);
+
+  FLOAT_COMPARE(r_x, (*mask_cmp)[0].data(), xsize * ysize);
+  FLOAT_COMPARE(r_y, (*mask_cmp)[1].data(), xsize * ysize);
+  FLOAT_COMPARE(r_b, (*mask_cmp)[2].data(), xsize * ysize);
+
+  ocl.releaseMemChannels(cl_xyb0);
+  ocl.releaseMemChannels(cl_xyb1);
+  ocl.releaseMemChannels(cl_mask);
+  clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.x, r_x, channel_size, NULL, NULL);
+  clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.y, r_y, channel_size, NULL, NULL);
+  clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.b, r_b, channel_size, NULL, NULL);
 }
 
 // ian todo
@@ -424,10 +494,21 @@ void tclMinSquareVal(float *img, size_t square_size, size_t offset,
 	clReleaseMemObject(r);
 }
 
-// ian todo
-void tclScaleImage(double scale, float *result)
+void tclScaleImage(double scale, float *result_org, float *result_cmp, size_t length)
 {
+  cl_int err = 0;
+  ocl_args_d_t &ocl = getOcl();
+  cl_mem mem_result_org = ocl.allocMem(length * sizeof(float));
+  clEnqueueWriteBuffer(ocl.commandQueue, mem_result_org, CL_FALSE, 0, length * sizeof(float), result_org, 0, NULL, NULL);
+  clScaleImageEx(mem_result_org, length, scale);
+
+  cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result_org, true, CL_MAP_READ, 0, length * sizeof(float), 0, NULL, NULL, &err);
+  err = clFinish(ocl.commandQueue);
+
+  FLOAT_COMPARE(r_r, result_cmp, length);
 
+  clEnqueueUnmapMemObject(ocl.commandQueue, mem_result_org, r_r, length * sizeof(float), NULL, NULL);
+  clReleaseMemObject(mem_result_org);
 }
 
 // strong todo
diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h
index 1ce0466c..4c8b4cb0 100644
--- a/clguetzli/clguetzli_test.h
+++ b/clguetzli/clguetzli_test.h
@@ -45,7 +45,8 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const
 
 void tclCalculateDiffmap(const size_t xsize, const size_t ysize,
 	const size_t step,
-	float *diffmap);
+	float *diffmap, size_t org_len,
+	float *diffmap_cmp);
 
 void tclConvolution(size_t xsize, size_t ysize,
 	size_t xstep,
@@ -59,14 +60,14 @@ void tclBlur(size_t xsize, size_t ysize, float* channel, double sigma,
 	double border_ratio);
 
 void tclDiffPrecompute(
-	const float *xyb0_x, const float *xyb0_y, const float *xyb0_b,
-	const float *xyb1_x, const float *xyb1_y, const float *xyb1_b,
+  const std::vector<std::vector<float> > &xyb0,
+  const std::vector<std::vector<float> > &xyb1,
 	size_t xsize, size_t ysize,
-	float *mask_x, float *mask_y, float *mask_b);
+  std::vector<std::vector<float> > *mask_cmp);
 
 void tclAverage5x5(int xsize, int ysize, float *diffs);
 
-void tclScaleImage(double scale, float *result);
+void tclScaleImage(double scale, float *result_org, float *result_cmp, size_t length);
 
 void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ysize,
 	float* result_r, float* result_g, float* result_b);
@@ -74,3 +75,7 @@ void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ys
 void tclMinSquareVal(float *img, size_t square_size, size_t offset,
 	size_t xsize, size_t ysize,
 	float *values);
+
+void tclUpsample(float* image, size_t xsize, size_t ysize,
+	size_t xstep, size_t ystep,
+	float* result);
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 948fea2e..99b14e31 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -144,6 +144,10 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
             downsampled_output[(y / ystep) * dxsize + (x / xstep)];
       }
     }
+	if (g_checkOpenCL)
+	{
+		tclUpsample(downsampled_output.data(), xsize, ysize, xstep, ystep, channel);
+	}
   }
 
   if (g_checkOpenCL)
@@ -1080,6 +1084,11 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize,
 }
 
 static void ScaleImage(double scale, std::vector<float> *result) {
+  std::vector<float> result_org;
+	if (g_checkOpenCL)
+	{
+    result_org = *result;
+	}
   PROFILER_FUNC;
   for (size_t i = 0; i < result->size(); ++i) {
     (*result)[i] *= static_cast<float>(scale);
@@ -1087,7 +1096,7 @@ static void ScaleImage(double scale, std::vector<float> *result) {
 
   if (g_checkOpenCL)
   {
-	  tclScaleImage(scale, (*result).data());
+    tclScaleImage(scale, result_org.data(), (*result).data(), (*result).size());
   }
 }
 
@@ -1096,6 +1105,11 @@ static void ScaleImage(double scale, std::vector<float> *result) {
 void CalculateDiffmap(const size_t xsize, const size_t ysize,
                       const size_t step,
                       std::vector<float>* diffmap) {
+  std::vector<float> diffmap_org;
+  if (g_checkOpenCL)
+  {
+	  diffmap_org = *diffmap;
+  }
   PROFILER_FUNC;
   // Shift the diffmap more correctly above the pixels, from 2.5 pixels to 0.5
   // pixels distance over the original image. The border of 2 pixels on top and
@@ -1150,7 +1164,7 @@ void CalculateDiffmap(const size_t xsize, const size_t ysize,
   }
   if (g_checkOpenCL)
   {
-	  tclCalculateDiffmap(xsize, ysize, step, (*diffmap).data());
+	  tclCalculateDiffmap(xsize, ysize, step, diffmap_org.data(), diffmap_org.size(), (*diffmap).data());
   }
 }
 
@@ -1661,11 +1675,7 @@ void DiffPrecompute(
 
   if (g_checkOpenCL)
   {
-	  tclDiffPrecompute(
-		  xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-		  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
-		  xsize, ysize,
-		  ((*mask)[0]).data(), ((*mask)[1]).data(), ((*mask)[2]).data());
+	  tclDiffPrecompute(xyb0, xyb1, xsize, ysize, mask);
   }
 }
 

From 7b9cf14d13bcc944387b2a4acf98c33fdaa093b0 Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Thu, 11 May 2017 23:49:35 +0800
Subject: [PATCH 060/189] Add tclAverage55

---
 clguetzli/clguetzli.h                              |  2 ++
 clguetzli/clguetzli_test.cpp                       | 12 +++++++++++-
 clguetzli/clguetzli_test.h                         |  2 +-
 third_party/butteraugli/butteraugli/butteraugli.cc |  8 +++++++-
 4 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 13111404..583e37e0 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -63,3 +63,5 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize,
 void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w);
 
 void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size_t ysize, ocl_channels mask/*out*/);
+
+void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize);
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 8965a633..38e3e966 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -463,9 +463,19 @@ void tclDiffPrecompute(
 }
 
 // ian todo
-void tclAverage5x5(int xsize, int ysize, float *diffs)
+void tclAverage5x5(int xsize, int ysize, std::vector<float> &diffs_org, std::vector<float> &diffs_cmp)
 {
+  cl_int err = 0;
+  ocl_args_d_t &ocl = getOcl();
+  cl_mem mem_diff = ocl.allocMem(xsize * ysize * sizeof(float));
+  clEnqueueWriteBuffer(ocl.commandQueue, mem_diff, CL_FALSE, 0, xsize * ysize * sizeof(float), diffs_org.data(), 0, NULL, NULL);
+  clAverage5x5Ex(mem_diff, xsize, ysize);
+  cl_float *r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diff, true, CL_MAP_READ, 0, xsize * ysize * sizeof(float), 0, NULL, NULL, &err);
+  err = clFinish(ocl.commandQueue);
+  FLOAT_COMPARE(r, diffs_cmp.data(), xsize * ysize);
 
+  clEnqueueUnmapMemObject(ocl.commandQueue, mem_diff, r, xsize * ysize * sizeof(float), NULL, NULL);
+  clReleaseMemObject(mem_diff);
 }
 
 // chrisk todo
diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h
index 4c8b4cb0..226d3d0a 100644
--- a/clguetzli/clguetzli_test.h
+++ b/clguetzli/clguetzli_test.h
@@ -65,7 +65,7 @@ void tclDiffPrecompute(
 	size_t xsize, size_t ysize,
   std::vector<std::vector<float> > *mask_cmp);
 
-void tclAverage5x5(int xsize, int ysize, float *diffs);
+void tclAverage5x5(int xsize, int ysize, std::vector<float> &diffs_org, std::vector<float> &diffs_cmp);
 
 void tclScaleImage(double scale, float *result_org, float *result_cmp, size_t length);
 
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 99b14e31..fdc1f49a 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -1562,6 +1562,12 @@ void MinSquareVal(size_t square_size, size_t offset,
 
 // ===== Functions used by Mask only =====
 void Average5x5(int xsize, int ysize, std::vector<float>* diffs) {
+  std::vector<float> diffs_org;
+  if (g_checkOpenCL)
+  {
+    diffs_org = *diffs;
+  }
+
   PROFILER_FUNC;
   if (xsize < 4 || ysize < 4) {
     // TODO: Make this work for small dimensions as well.
@@ -1615,7 +1621,7 @@ void Average5x5(int xsize, int ysize, std::vector<float>* diffs) {
 
   if (g_checkOpenCL)
   {
-	  tclAverage5x5(xsize, ysize, (*diffs).data());
+	  tclAverage5x5(xsize, ysize, diffs_org, *diffs);
   }
 }
 

From 81c43547dcbf4041b31e3450aec7194dee979049 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 12 May 2017 14:01:22 +0800
Subject: [PATCH 061/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E8=AE=A1=E7=AE=97?=
 =?UTF-8?q?=E7=BB=93=E6=9E=9C+=E5=A2=9E=E5=8A=A0comparator=E5=AD=90?=
 =?UTF-8?q?=E7=B1=BB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clBlock.cpp                         | 394 ++++++++++++++++++
 clguetzli/clBlock.h                           |  24 ++
 clguetzli/clguetzli.cl                        |  29 +-
 clguetzli/clguetzli.cpp                       |  99 +++--
 clguetzli/clguetzli.h                         |   3 +-
 clguetzli/ocl.h                               |   4 +-
 guetzli.vcxproj                               |   2 +
 guetzli.vcxproj.filters                       |   6 +
 guetzli/butteraugli_comparator.h              |   2 +-
 guetzli/processor.cc                          |  14 +-
 .../butteraugli/butteraugli/butteraugli.cc    |   3 +-
 11 files changed, 508 insertions(+), 72 deletions(-)
 create mode 100644 clguetzli/clBlock.cpp
 create mode 100644 clguetzli/clBlock.h

diff --git a/clguetzli/clBlock.cpp b/clguetzli/clBlock.cpp
new file mode 100644
index 00000000..4650a813
--- /dev/null
+++ b/clguetzli/clBlock.cpp
@@ -0,0 +1,394 @@
+#include <stdint.h>
+#include <algorithm>
+#include <cmath>
+#include "clBlock.h"
+#include "guetzli\idct.h"
+
+typedef int16_t coeff_t;
+
+const double* NewSrgb8ToLinearTable() {
+	// Heap-allocated table with one entry per 8-bit code; the caller owns the array.
+	double* const lut = new double[256];
+	for (int v = 0; v < 256; ++v) {
+		// Codes 0..10 take the linear segment, all others the 2.4 power curve.
+		lut[v] = (v < 11)
+			? v / 12.92
+			: 255.0 * std::pow(((v / 255.0) + 0.055) / 1.055, 2.4);
+	}
+	return lut;
+}
+
+const double* Srgb8ToLinearTable() {  // accessor for the shared table built by NewSrgb8ToLinearTable()
+	static const double* const cached_table = NewSrgb8ToLinearTable();  // initialised on first call, never freed
+	return cached_table;
+}
+
+static const int kCrToRedTable[256] = {
+	-179, -178, -177, -175, -174, -172, -171, -170, -168, -167, -165, -164,
+	-163, -161, -160, -158, -157, -156, -154, -153, -151, -150, -149, -147,
+	-146, -144, -143, -142, -140, -139, -137, -136, -135, -133, -132, -130,
+	-129, -128, -126, -125, -123, -122, -121, -119, -118, -116, -115, -114,
+	-112, -111, -109, -108, -107, -105, -104, -102, -101, -100,  -98,  -97,
+	-95,  -94,  -93,  -91,  -90,  -88,  -87,  -86,  -84,  -83,  -81,  -80,
+	-79,  -77,  -76,  -74,  -73,  -72,  -70,  -69,  -67,  -66,  -64,  -63,
+	-62,  -60,  -59,  -57,  -56,  -55,  -53,  -52,  -50,  -49,  -48,  -46,
+	-45,  -43,  -42,  -41,  -39,  -38,  -36,  -35,  -34,  -32,  -31,  -29,
+	-28,  -27,  -25,  -24,  -22,  -21,  -20,  -18,  -17,  -15,  -14,  -13,
+	-11,  -10,   -8,   -7,   -6,   -4,   -3,   -1,    0,    1,    3,    4,
+	6,    7,    8,   10,   11,   13,   14,   15,   17,   18,   20,   21,
+	22,   24,   25,   27,   28,   29,   31,   32,   34,   35,   36,   38,
+	39,   41,   42,   43,   45,   46,   48,   49,   50,   52,   53,   55,
+	56,   57,   59,   60,   62,   63,   64,   66,   67,   69,   70,   72,
+	73,   74,   76,   77,   79,   80,   81,   83,   84,   86,   87,   88,
+	90,   91,   93,   94,   95,   97,   98,  100,  101,  102,  104,  105,
+	107,  108,  109,  111,  112,  114,  115,  116,  118,  119,  121,  122,
+	123,  125,  126,  128,  129,  130,  132,  133,  135,  136,  137,  139,
+	140,  142,  143,  144,  146,  147,  149,  150,  151,  153,  154,  156,
+	157,  158,  160,  161,  163,  164,  165,  167,  168,  170,  171,  172,
+	174,  175,  177,  178
+};
+
+static const int kCbToBlueTable[256] = {
+	-227, -225, -223, -222, -220, -218, -216, -214, -213, -211, -209, -207,
+	-206, -204, -202, -200, -198, -197, -195, -193, -191, -190, -188, -186,
+	-184, -183, -181, -179, -177, -175, -174, -172, -170, -168, -167, -165,
+	-163, -161, -159, -158, -156, -154, -152, -151, -149, -147, -145, -144,
+	-142, -140, -138, -136, -135, -133, -131, -129, -128, -126, -124, -122,
+	-120, -119, -117, -115, -113, -112, -110, -108, -106, -105, -103, -101,
+	-99,  -97,  -96,  -94,  -92,  -90,  -89,  -87,  -85,  -83,  -82,  -80,
+	-78,  -76,  -74,  -73,  -71,  -69,  -67,  -66,  -64,  -62,  -60,  -58,
+	-57,  -55,  -53,  -51,  -50,  -48,  -46,  -44,  -43,  -41,  -39,  -37,
+	-35,  -34,  -32,  -30,  -28,  -27,  -25,  -23,  -21,  -19,  -18,  -16,
+	-14,  -12,  -11,   -9,   -7,   -5,   -4,   -2,    0,    2,    4,    5,
+	7,    9,   11,   12,   14,   16,   18,   19,   21,   23,   25,   27,
+	28,   30,   32,   34,   35,   37,   39,   41,   43,   44,   46,   48,
+	50,   51,   53,   55,   57,   58,   60,   62,   64,   66,   67,   69,
+	71,   73,   74,   76,   78,   80,   82,   83,   85,   87,   89,   90,
+	92,   94,   96,   97,   99,  101,  103,  105,  106,  108,  110,  112,
+	113,  115,  117,  119,  120,  122,  124,  126,  128,  129,  131,  133,
+	135,  136,  138,  140,  142,  144,  145,  147,  149,  151,  152,  154,
+	156,  158,  159,  161,  163,  165,  167,  168,  170,  172,  174,  175,
+	177,  179,  181,  183,  184,  186,  188,  190,  191,  193,  195,  197,
+	198,  200,  202,  204,  206,  207,  209,  211,  213,  214,  216,  218,
+	220,  222,  223,  225,
+};
+
+static const int kCrToGreenTable[256] = {
+	5990656,  5943854,  5897052,  5850250,  5803448,  5756646,  5709844,  5663042,
+	5616240,  5569438,  5522636,  5475834,  5429032,  5382230,  5335428,  5288626,
+	5241824,  5195022,  5148220,  5101418,  5054616,  5007814,  4961012,  4914210,
+	4867408,  4820606,  4773804,  4727002,  4680200,  4633398,  4586596,  4539794,
+	4492992,  4446190,  4399388,  4352586,  4305784,  4258982,  4212180,  4165378,
+	4118576,  4071774,  4024972,  3978170,  3931368,  3884566,  3837764,  3790962,
+	3744160,  3697358,  3650556,  3603754,  3556952,  3510150,  3463348,  3416546,
+	3369744,  3322942,  3276140,  3229338,  3182536,  3135734,  3088932,  3042130,
+	2995328,  2948526,  2901724,  2854922,  2808120,  2761318,  2714516,  2667714,
+	2620912,  2574110,  2527308,  2480506,  2433704,  2386902,  2340100,  2293298,
+	2246496,  2199694,  2152892,  2106090,  2059288,  2012486,  1965684,  1918882,
+	1872080,  1825278,  1778476,  1731674,  1684872,  1638070,  1591268,  1544466,
+	1497664,  1450862,  1404060,  1357258,  1310456,  1263654,  1216852,  1170050,
+	1123248,  1076446,  1029644,   982842,   936040,   889238,   842436,   795634,
+	748832,   702030,   655228,   608426,   561624,   514822,   468020,   421218,
+	374416,   327614,   280812,   234010,   187208,   140406,    93604,    46802,
+	0,   -46802,   -93604,  -140406,  -187208,  -234010,  -280812,  -327614,
+	-374416,  -421218,  -468020,  -514822,  -561624,  -608426,  -655228,  -702030,
+	-748832,  -795634,  -842436,  -889238,  -936040,  -982842, -1029644, -1076446,
+	-1123248, -1170050, -1216852, -1263654, -1310456, -1357258, -1404060, -1450862,
+	-1497664, -1544466, -1591268, -1638070, -1684872, -1731674, -1778476, -1825278,
+	-1872080, -1918882, -1965684, -2012486, -2059288, -2106090, -2152892, -2199694,
+	-2246496, -2293298, -2340100, -2386902, -2433704, -2480506, -2527308, -2574110,
+	-2620912, -2667714, -2714516, -2761318, -2808120, -2854922, -2901724, -2948526,
+	-2995328, -3042130, -3088932, -3135734, -3182536, -3229338, -3276140, -3322942,
+	-3369744, -3416546, -3463348, -3510150, -3556952, -3603754, -3650556, -3697358,
+	-3744160, -3790962, -3837764, -3884566, -3931368, -3978170, -4024972, -4071774,
+	-4118576, -4165378, -4212180, -4258982, -4305784, -4352586, -4399388, -4446190,
+	-4492992, -4539794, -4586596, -4633398, -4680200, -4727002, -4773804, -4820606,
+	-4867408, -4914210, -4961012, -5007814, -5054616, -5101418, -5148220, -5195022,
+	-5241824, -5288626, -5335428, -5382230, -5429032, -5475834, -5522636, -5569438,
+	-5616240, -5663042, -5709844, -5756646, -5803448, -5850250, -5897052, -5943854,
+};
+
+static const int kCbToGreenTable[256] = {
+	2919680,  2897126,  2874572,  2852018,  2829464,  2806910,  2784356,  2761802,
+	2739248,  2716694,  2694140,  2671586,  2649032,  2626478,  2603924,  2581370,
+	2558816,  2536262,  2513708,  2491154,  2468600,  2446046,  2423492,  2400938,
+	2378384,  2355830,  2333276,  2310722,  2288168,  2265614,  2243060,  2220506,
+	2197952,  2175398,  2152844,  2130290,  2107736,  2085182,  2062628,  2040074,
+	2017520,  1994966,  1972412,  1949858,  1927304,  1904750,  1882196,  1859642,
+	1837088,  1814534,  1791980,  1769426,  1746872,  1724318,  1701764,  1679210,
+	1656656,  1634102,  1611548,  1588994,  1566440,  1543886,  1521332,  1498778,
+	1476224,  1453670,  1431116,  1408562,  1386008,  1363454,  1340900,  1318346,
+	1295792,  1273238,  1250684,  1228130,  1205576,  1183022,  1160468,  1137914,
+	1115360,  1092806,  1070252,  1047698,  1025144,  1002590,   980036,   957482,
+	934928,   912374,   889820,   867266,   844712,   822158,   799604,   777050,
+	754496,   731942,   709388,   686834,   664280,   641726,   619172,   596618,
+	574064,   551510,   528956,   506402,   483848,   461294,   438740,   416186,
+	393632,   371078,   348524,   325970,   303416,   280862,   258308,   235754,
+	213200,   190646,   168092,   145538,   122984,   100430,    77876,    55322,
+	32768,    10214,   -12340,   -34894,   -57448,   -80002,  -102556,  -125110,
+	-147664,  -170218,  -192772,  -215326,  -237880,  -260434,  -282988,  -305542,
+	-328096,  -350650,  -373204,  -395758,  -418312,  -440866,  -463420,  -485974,
+	-508528,  -531082,  -553636,  -576190,  -598744,  -621298,  -643852,  -666406,
+	-688960,  -711514,  -734068,  -756622,  -779176,  -801730,  -824284,  -846838,
+	-869392,  -891946,  -914500,  -937054,  -959608,  -982162, -1004716, -1027270,
+	-1049824, -1072378, -1094932, -1117486, -1140040, -1162594, -1185148, -1207702,
+	-1230256, -1252810, -1275364, -1297918, -1320472, -1343026, -1365580, -1388134,
+	-1410688, -1433242, -1455796, -1478350, -1500904, -1523458, -1546012, -1568566,
+	-1591120, -1613674, -1636228, -1658782, -1681336, -1703890, -1726444, -1748998,
+	-1771552, -1794106, -1816660, -1839214, -1861768, -1884322, -1906876, -1929430,
+	-1951984, -1974538, -1997092, -2019646, -2042200, -2064754, -2087308, -2109862,
+	-2132416, -2154970, -2177524, -2200078, -2222632, -2245186, -2267740, -2290294,
+	-2312848, -2335402, -2357956, -2380510, -2403064, -2425618, -2448172, -2470726,
+	-2493280, -2515834, -2538388, -2560942, -2583496, -2606050, -2628604, -2651158,
+	-2673712, -2696266, -2718820, -2741374, -2763928, -2786482, -2809036, -2831590,
+};
+
+static const uint8_t kRangeLimitLut[4 * 256] = {
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
+	16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
+	32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+	48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
+	64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
+	80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+	96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+	112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+	128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+	144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+	160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+	176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+	192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+	208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+	224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+	240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+};
+
+static const uint8_t* kRangeLimit = kRangeLimitLut + 384;  // skips the 384 leading zeros: kRangeLimit[i] is i clamped to [0, 255] for i in [-384, 639]
+
+void CoeffToIDCT(coeff_t *block, uint8_t *idct)  // thin wrapper: forwards both pointers unchanged
+{
+	guetzli::ComputeBlockIDCT(block, idct);  // NOTE(review): presumably 64 coefficients in, 64 samples out - confirm in guetzli/idct.h
+}
+
+void IDCTToImage(const uint8_t idct[8 * 8], uint16_t *pixels_)
+{
+	// Copies one 8x8 block of 8-bit samples into pixels_, scaling each by 16.
+	// The block sits at the origin of an image that is exactly 8x8, so the
+	// destination index of a sample equals its source index.
+	const int kImageWidth = 8;
+	const int kImageHeight = 8;
+
+	for (int src = 0; src < 8 * 8; ++src) {
+		const int col = src % 8;
+		const int row = src / 8;
+		// Skip anything outside the image (cannot happen for an 8x8 image).
+		if (col < kImageWidth && row < kImageHeight) {
+			pixels_[row * kImageWidth + col] = idct[src] << 4;
+		}
+	}
+}
+
+// out = [YUVYUV....YUVYUV]
+// Writes the 64 samples of one 8x8 plane to every third byte of `out`, starting
+// at out[0]; the caller selects the channel through the pointer it passes in.
+// Input samples are scaled by 16 and are rounded back to 8 bits here.
+void ImageToYUV(uint16_t *pixels_, uint8_t *out)
+{
+	// Extent of the exported region and of the backing image. Both describe a
+	// single 8x8 block, so the two replication branches below never run; they
+	// only matter if these constants are ever changed.
+	const int width_ = 8, height_ = 8;
+	const int xmin = 0, ymin = 0;
+	const int xsize = 8, ysize = 8;
+	const int stride = 3;
+
+	const int x_stop = xmin + xsize;
+	const int y_stop = ymin + ysize;
+	const int x_valid = std::min(x_stop, width_);
+	const int y_valid = std::min(y_stop, height_);
+
+	for (int y = ymin; y < y_stop; ++y) {
+		if (y >= y_valid) {
+			// Row past the bottom of the image: repeat the row written before it.
+			for (int x = 0; x < xsize; ++x, out += stride) {
+				*out = out[-stride * xsize];
+			}
+			continue;
+		}
+		const uint16_t *row = pixels_ + y * width_;
+		for (int x = xmin; x < x_stop; ++x, out += stride) {
+			// Columns past the right edge repeat the previous sample.
+			*out = (x < x_valid)
+				? static_cast<uint8_t>((row[x] + 8 - (x & 1)) >> 4)
+				: out[-stride];
+		}
+	}
+}
+
+// pixel = [YUVYUV...YUVYUV] to [RGBRGB...RGBRGB]; the 64 triples are converted in place.
+void YUVToRGB(uint8_t* pixelBlock)
+{
+	uint8_t* const end = pixelBlock + 64 * 3;
+	for (uint8_t* px = pixelBlock; px != end; px += 3) {
+		const int luma = px[0];
+		const int cb = px[1];
+		const int cr = px[2];
+		// Green sums both chroma tables, whose entries are pre-scaled by 65536 (hence >> 16).
+		const int green_shift = (kCrToGreenTable[cr] + kCbToGreenTable[cb]) >> 16;
+		px[0] = kRangeLimit[luma + kCrToRedTable[cr]];
+		px[1] = kRangeLimit[luma + green_shift];
+		px[2] = kRangeLimit[luma + kCbToBlueTable[cb]];
+	}
+}
+
+// block = [Y x64][Cb x64][Cr x64] DCT coefficients; r, g and b each receive 64 linear values.
+void BlockToImage(coeff_t *block, float* r, float* g, float* b)
+{
+	// Inverse-transform each 64-coefficient plane into 8-bit samples.
+	uint8_t idct[8 * 8 * 3];
+	CoeffToIDCT(&block[0], &idct[0]);
+	CoeffToIDCT(&block[8 * 8], &idct[8 * 8]);
+	CoeffToIDCT(&block[8 * 8 * 2], &idct[8 * 8 * 2]);
+
+	// Planar copy of the samples, each scaled by 16.
+	uint16_t pixels[8 * 8 * 3];
+	IDCTToImage(&idct[0], &pixels[0]);
+	IDCTToImage(&idct[8*8], &pixels[8*8]);
+	IDCTToImage(&idct[8*8*2], &pixels[8*8*2]);
+
+	// ImageToYUV stores sample k at out[3 * k], so plane c must start at yuv[c];
+	// starting at yuv[64 * c] overruns this 192-byte buffer and leaves the Cb/Cr bytes unset.
+	uint8_t yuv[8 * 8 * 3];
+	ImageToYUV(&pixels[0], &yuv[0]);
+	ImageToYUV(&pixels[8*8], &yuv[1]);
+	ImageToYUV(&pixels[8*8*2], &yuv[2]);
+	YUVToRGB(yuv);
+
+	const double* lut = Srgb8ToLinearTable();
+	for (int i = 0; i < 8 * 8; i++) {
+		r[i] = lut[yuv[3 * i]];
+		g[i] = lut[yuv[3 * i + 1]];
+		b[i] = lut[yuv[3 * i + 2]];
+	}
+}
+
+namespace guetzli
+{
+	ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height,
+		const std::vector<uint8_t>* rgb,
+		const float target_distance, ProcessStats* stats)
+		: ButteraugliComparator(width, height, rgb, target_distance, stats)
+	{
+		// Nothing to do beyond constructing the base class with the same arguments.
+	}
+	// Forwards to the base-class implementation without adding behaviour.
+	void ButteraugliComparatorEx::StartBlockComparisons()
+	{
+		ButteraugliComparator::StartBlockComparisons();
+	}
+	// Forwards to the base-class implementation without adding behaviour.
+	void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y)
+	{
+		ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y);
+	}
+	// NOTE(review): returns 0 unconditionally; the comparison code after the first statement is unreachable.
+	double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, coeff_t* candidate_block)
+	{
+        return 0;  // NOTE(review): early exit disables everything below - confirm whether this is intentional
+		int block_x = block_x_ * factor_x_ + off_x;
+		int block_y = block_y_ * factor_y_ + off_y;
+		int xmin = 8 * block_x;
+		int ymin = 8 * block_y;
+		int block_ix = off_y * factor_x_ + off_x;
+		const std::vector<std::vector<float> >& rgb0_c = per_block_pregamma_[block_ix];
+		// rgb1_c2 is filled from img but is not read again in this function.
+		std::vector<std::vector<float> > rgb1_c2(3, std::vector<float>(kDCTBlockSize));
+		img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2);
+		// rgb1_c is rebuilt from the candidate coefficients via BlockToImage instead.
+		std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
+		BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data());
+
+		::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c);
+
+		std::vector<std::vector<float> > rgb0 = rgb0_c;
+		std::vector<std::vector<float> > rgb1 = rgb1_c;
+		// The unmodified inputs and the two copies made above are passed together.
+		::butteraugli::MaskHighIntensityChange(8, 8, rgb0_c, rgb1_c, rgb0, rgb1);
+		// Flatten both blocks into channel-major double arrays for ButteraugliBlockDiff.
+		double b0[3 * kDCTBlockSize];
+		double b1[3 * kDCTBlockSize];
+		for (int c = 0; c < 3; ++c) {
+			for (int ix = 0; ix < kDCTBlockSize; ++ix) {
+				b0[c * kDCTBlockSize + ix] = rgb0[c][ix];
+				b1[c * kDCTBlockSize + ix] = rgb1[c][ix];
+			}
+		}
+		double diff_xyz_dc[3] = { 0.0 };
+		double diff_xyz_ac[3] = { 0.0 };
+		double diff_xyz_edge_dc[3] = { 0.0 };
+		::butteraugli::ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc);
+		// Per-channel weights read from mask_xyz_ at the block's top-left pixel.
+		double scale[3];
+		for (int c = 0; c < 3; ++c) {
+			scale[c] = mask_xyz_[c][ymin * width_ + xmin];
+		}
+
+		static const double kEdgeWeight = 0.05;
+		// Weighted DC+AC error and edge error are blended, then square-rooted.
+		double diff = 0.0;
+		double diff_edge = 0.0;
+		for (int c = 0; c < 3; ++c) {
+			diff += diff_xyz_dc[c] * scale[c];
+			diff += diff_xyz_ac[c] * scale[c];
+			diff_edge += diff_xyz_edge_dc[c] * scale[c];
+		}
+		return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge);
+	}
+}
diff --git a/clguetzli/clBlock.h b/clguetzli/clBlock.h
new file mode 100644
index 00000000..a3c91e71
--- /dev/null
+++ b/clguetzli/clBlock.h
@@ -0,0 +1,24 @@
+#pragma once
+#include <vector>
+#include "guetzli\butteraugli_comparator.h"
+
+namespace guetzli {
+	// ButteraugliComparator subclass that adds CompareBlockEx, which receives the candidate coefficients directly.
+	class ButteraugliComparatorEx : public ButteraugliComparator
+	{
+	public:
+		ButteraugliComparatorEx(const int width, const int height,
+			const std::vector<uint8_t>* rgb,
+			const float target_distance, ProcessStats* stats);
+		// Defined in clBlock.cpp as a plain call to the base-class method.
+		void StartBlockComparisons();
+		// Defined in clBlock.cpp as a plain call to the base-class method.
+		void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y);
+		// NOTE(review): the clBlock.cpp definition currently returns 0 before doing any work.
+		double CompareBlockEx(const OutputImage& img, int off_x, int off_y, coeff_t* candidate_block);
+		// Not referenced by any method defined in clBlock.cpp.
+	protected:
+		std::vector<float> imgOpsinDynamicsBlockList;
+	};
+
+}
\ No newline at end of file
diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index b4e3cdc8..308ef1d3 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1276,39 +1276,46 @@ __kernel void UpsampleSquareRoot(__global float *diffmap, int xsize, int ysize,
 	const int res_x = get_global_id(0);
 	const int res_y = get_global_id(1);
 
-	if (res_y + 8 - step >= ysize) return;
-	if (res_x + 8 - step >= xsize) return;
+    const int res_xsize = get_global_size(0);
+    const int res_ysize = get_global_size(1);
+
+    const int pos_x = res_x * step;
+    const int pos_y = res_y * step;
+
+	if (pos_y + 8 - step >= ysize) return;
+	if (pos_x + 8 - step >= xsize) return;
 
 	int s2 = (8 - step) / 2;
+
 	// Upsample and take square root.
-	const size_t res_xsize = (xsize + step - 1) / step;
-	size_t res_ix = (res_y * res_xsize + res_x) / step;
-	float orig_val = diffmap[res_ix];
+	float orig_val = diffmap[res_y * res_xsize + res_x];
+
 	const float kInitialSlope = 100;
 	// TODO(b/29974893): Until that is fixed do not call sqrt on very small
 	// numbers.
 	double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope))
 		? kInitialSlope * orig_val
 		: sqrt(orig_val);
+
 	for (size_t off_y = 0; off_y < step; ++off_y) {
 		for (size_t off_x = 0; off_x < step; ++off_x) {
-			diffmap_out[(res_y + off_y + s2) * xsize +
-				res_x + off_x + s2] = val;
+			diffmap_out[(pos_y + off_y + s2) * xsize + pos_x + off_x + s2] = val;
 		}
 	}
 }
 
-kernel void CalculateDiffmapGetBlurred(__global float *diffmap, int s, int s2, __global float *blurred)
+kernel void removeBorder(__global float *in, int in_xsize, int s, int s2, __global float *out)  // in_xsize is not read in the body
 {
 	const int x = get_global_id(0);
 	const int y = get_global_id(1);
+	// The NDRange has the dimensions of the cropped output image.
 	const int xsize = get_global_size(0);
 	const int ysize = get_global_size(1);
 
-	blurred[y * xsize + x] = diffmap[(y + s2) * xsize + s + x + s2];
+	out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2];  // input row stride is xsize + s; source is offset by s2 in both axes
 }
 
-kernel void GetDiffmapFromBlurred(__global float *blurred, int s, int s2, __global float *diffmap)
+kernel void addBorder(__global float *out, int s, int s2, __global float *in)
 {
 	const int x = get_global_id(0);
 	const int y = get_global_id(1);
@@ -1316,7 +1323,7 @@ kernel void GetDiffmapFromBlurred(__global float *blurred, int s, int s2, __glob
 	const int ysize = get_global_size(1);
 
 	const double mul1 = 24.8235314874;
-	diffmap[(y + s2) * xsize + x + s2]	+= (float)(mul1) * blurred[y * (xsize - s) + x];
+	out[(y + s2) * xsize + x + s2]	+= (float)(mul1) * in[y * (xsize - s) + x];
 
 }
 
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index b37eabcd..38d31785 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -57,8 +57,8 @@ ocl_args_d_t& getOcl(void)
 	ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "MaskHighIntensityChange", &err);
 	ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "DiffPrecompute", &err);
 	ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "UpsampleSquareRoot", &err);
-	ocl.kernel[KERNEL_CALCULATEDIFFMAPGETBLURRED] = clCreateKernel(ocl.program, "CalculateDiffmapGetBlurred", &err);
-	ocl.kernel[KERNEL_GETDIFFMAPFROMBLURRED] = clCreateKernel(ocl.program, "GetDiffmapFromBlurred", &err);
+	ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "addBorder", &err);
+	ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "removeBorder", &err);
 	ocl.kernel[KERNEL_AVERAGEADDIMAGE] = clCreateKernel(ocl.program, "AverageAddImage", &err);
 	ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "edgeDetectorMap", &err);
 	ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "blockDiffMap", &err);
@@ -936,7 +936,7 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step
 	cl_int clysize = ysize;
 	cl_int clstep = step;
 
-  cl_mem mem_diffmap = ocl.allocMem(xsize * ysize * sizeof(float));
+    cl_mem mem_diffmap = ocl.allocMem(xsize * ysize * sizeof(float));
 
 	cl_kernel kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT];
 	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&diffmap);
@@ -965,23 +965,26 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step
 	{
 		LogError("Error: clUpsampleSquareRootEx() clFinish returned %s.\n", TranslateOpenCLError(err));
 	}
-  clReleaseMemObject(mem_diffmap);
+
+    clReleaseMemObject(mem_diffmap);
 }
 
-void clCalculateDiffmapGetBlurredEx(cl_mem diffmap, size_t xsize, size_t ysize, int s, int s2, cl_mem blurred)
+void clRemoveBorderEx(cl_mem in, size_t xsize, size_t ysize, int step, cl_mem out)
 {
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
 
-	cl_int cls = s;
-	cl_int cls2 = s2;
-	cl_kernel kernel = ocl.kernel[KERNEL_CALCULATEDIFFMAPGETBLURRED];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&diffmap);
-	clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&s);
-	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&s2);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&blurred);
-
-	size_t globalWorkSize[2] = { xsize, ysize };
+	cl_int cls = 8 - step;
+	cl_int cls2 = (8 - step) / 2;
+    cl_int clxsize = xsize;
+	cl_kernel kernel = ocl.kernel[KERNEL_REMOVEBORDER];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), &in);
+    clSetKernelArg(kernel, 1, sizeof(cl_int), &clxsize);
+	clSetKernelArg(kernel, 2, sizeof(cl_int), &cls);
+	clSetKernelArg(kernel, 3, sizeof(cl_int), &cls2);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), &out);
+
+	size_t globalWorkSize[2] = { xsize - cls, ysize - cls};
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
@@ -994,20 +997,20 @@ void clCalculateDiffmapGetBlurredEx(cl_mem diffmap, size_t xsize, size_t ysize,
 	}
 }
 
-void clGetDiffmapFromBlurredEx(cl_mem diffmap, size_t xsize, size_t ysize, int s, int s2, cl_mem blurred)
+void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in)
 {
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
 
-	cl_int cls = s;
-	cl_int cls2 = s2;
-	cl_kernel kernel = ocl.kernel[KERNEL_CALCULATEDIFFMAPGETBLURRED];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&blurred);
-	clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&s);
-	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&s2);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&diffmap);
+    cl_int cls = 8 - step;
+    cl_int cls2 = (8 - step) / 2;
+	cl_kernel kernel = ocl.kernel[KERNEL_ADDBORDER];
+	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&out);
+	clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&cls);
+	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&cls2);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&in);
 
-	size_t globalWorkSize[2] = { xsize, ysize };
+	size_t globalWorkSize[2] = { xsize - cls, ysize - cls };
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
@@ -1027,29 +1030,35 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize,
 	static const double kSigma = 8.8510880283;
 	static const double mul1 = 24.8235314874;
 	static const double scale = 1.0 / (1.0 + mul1);
+
 	const int s = 8 - step;
 	int s2 = (8 - step) / 2;
 
 	ocl_args_d_t &ocl = getOcl();
-  cl_mem blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float));
-	clCalculateDiffmapGetBlurredEx(diffmap, (xsize - s), (ysize - s), s, s2, blurred);
+	cl_mem blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float));
+	clRemoveBorderEx(diffmap, xsize, ysize, step, blurred);
 
 	static const double border_ratio = 0.03027655136;
 	clBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio);
-	clGetDiffmapFromBlurredEx(diffmap, (xsize - s), (ysize - s), s, s2, blurred);
+
+	clAddBorderEx(diffmap, xsize, ysize, step, blurred);
 	clScaleImageEx(diffmap, xsize * ysize, scale);
-  clReleaseMemObject(blurred);
+
+	clReleaseMemObject(blurred);
 }
 
 void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 								 float* r2, float* g2, float* b2,
 								 size_t xsize, size_t ysize,
-								 size_t res_xsize, size_t res_ysize,
 								 size_t step,
-								 float* result, size_t result_len)
+								 float* result)
 {
 
-	cl_int channel_size = xsize * ysize * sizeof(float);
+	const size_t res_xsize = (xsize + step - 1) / step;
+	const size_t res_ysize = (ysize + step - 1) / step;
+
+	cl_int channel_size      = xsize * ysize * sizeof(float);
+	cl_int channel_step_size = res_xsize * res_ysize * sizeof(float);
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
@@ -1062,34 +1071,35 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
-
 	err = clFinish(ocl.commandQueue);
 
-	cl_mem mem_result = ocl.allocMem(res_xsize * res_ysize * sizeof(float));
+	cl_mem mem_result = ocl.allocMem(channel_size);
 	const float pattern = 0;
 	clEnqueueFillBuffer(ocl.commandQueue, mem_result, &pattern, sizeof(float), 0, res_xsize * res_ysize, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, mem_result, CL_FALSE, 0, result_len, result, 0, NULL, NULL);
-
-	ocl_channels mask = ocl.allocMemChannels(channel_size);
-	ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
+	clEnqueueWriteBuffer(ocl.commandQueue, mem_result, CL_FALSE, 0, channel_step_size, result, 0, NULL, NULL);
 
-	cl_mem edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
-	cl_mem block_diff_dc = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
-	cl_mem block_diff_ac = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
+	cl_mem edge_detector_map = ocl.allocMem(3 * channel_step_size);
+	cl_mem block_diff_dc	 = ocl.allocMem(3 * channel_step_size);
+	cl_mem block_diff_ac	 = ocl.allocMem(3 * channel_step_size);
 
 	clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
 
 	clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge_detector_map);
 	clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac);
 	clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac);
+    {
+        ocl_channels mask = ocl.allocMemChannels(channel_size);
+        ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
+        clMaskEx(xyb0, xyb1, xsize, ysize, mask, mask_dc);
+        clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, res_xsize, step, mem_result);
 
-	clMaskEx(xyb0, xyb1, xsize, ysize, mask, mask_dc);
-
-	clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, res_xsize, step, mem_result);
+        ocl.releaseMemChannels(mask);
+        ocl.releaseMemChannels(mask_dc);
+    }
 
     clCalculateDiffmapEx(mem_result, xsize, ysize, step);
 
-	cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+	cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, res_xsize * res_ysize * sizeof(float), 0, NULL, NULL, &err);
 	err = clFinish(ocl.commandQueue);
 	memcpy(result, result_r, channel_size);
 
@@ -1103,8 +1113,5 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	clReleaseMemObject(block_diff_dc);
 	clReleaseMemObject(block_diff_ac);
 
-	ocl.releaseMemChannels(mask);
-	ocl.releaseMemChannels(mask_dc);
-
 	clReleaseMemObject(mem_result);
 }
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 583e37e0..a287c8cc 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -32,9 +32,8 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float*
 void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	float* r2, float* g2, float* b2,
 	size_t xsize, size_t ysize,
-	size_t res_xsize, size_t res_ysize,
 	size_t step,
-	float* result, size_t result_len);
+	float* result);
 
 void clCombineChannelsEx(
 	ocl_channels mask,
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index aac82f31..b9ada586 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -57,8 +57,8 @@ enum KernelName {
 	KERNEL_MASKHIGHINTENSITYCHANGE,
 	KERNEL_DIFFPRECOMPUTE,
 	KERNEL_UPSAMPLESQUAREROOT,
-	KERNEL_CALCULATEDIFFMAPGETBLURRED,
-	KERNEL_GETDIFFMAPFROMBLURRED,
+	KERNEL_ADDBORDER,
+	KERNEL_REMOVEBORDER,
 	KERNEL_AVERAGEADDIMAGE,
 	KERNEL_EDGEDETECTOR,
 	KERNEL_BLOCKDIFFMAP,
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 3026fb04..d4388976 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -193,6 +193,7 @@
     </Link>
   </ItemDefinitionGroup>
   <ItemGroup>
+    <ClInclude Include="clguetzli\clBlock.h" />
     <ClInclude Include="clguetzli\clguetzli.h" />
     <ClInclude Include="clguetzli\clguetzli_test.h" />
     <ClInclude Include="clguetzli\ocl.h" />
@@ -288,6 +289,7 @@
     <ClInclude Include="third_party\zlib\zutil.h" />
   </ItemGroup>
   <ItemGroup>
+    <ClCompile Include="clguetzli\clBlock.cpp" />
     <ClCompile Include="clguetzli\clguetzli.cpp" />
     <ClCompile Include="clguetzli\clguetzli_test.cpp" />
     <ClCompile Include="clguetzli\ocl.cpp" />
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index a74b94c9..0f876fd4 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -306,6 +306,9 @@
     <ClInclude Include="clguetzli\clguetzli_test.h">
       <Filter>clguetzli</Filter>
     </ClInclude>
+    <ClInclude Include="clguetzli\clBlock.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc">
@@ -569,6 +572,9 @@
     <ClCompile Include="clguetzli\clguetzli_test.cpp">
       <Filter>clguetzli</Filter>
     </ClCompile>
+    <ClCompile Include="clguetzli\clBlock.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <None Include="third_party\libpng\pngwin.def">
diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h
index 098341e3..775b2fd7 100644
--- a/guetzli/butteraugli_comparator.h
+++ b/guetzli/butteraugli_comparator.h
@@ -66,7 +66,7 @@ class ButteraugliComparator : public Comparator {
       int factor_y, const std::vector<float>& distmap,
       std::vector<float>* block_weight) override;
 
- private:
+ protected:
   const int width_;
   const int height_;
   const float target_distance_;
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index b6057f5e..1637284d 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -22,6 +22,7 @@
 #include <vector>
 
 #include "guetzli/butteraugli_comparator.h"
+#include "clguetzli\clBlock.h"
 #include "guetzli/comparator.h"
 #include "guetzli/debug_print.h"
 #include "guetzli/fast_log.h"
@@ -407,12 +408,6 @@ void Processor::ComputeBlockZeroingOrder(
 
 	std::vector<std::pair<int, float> > input_order;
 	func(block, orig_block, comp_mask, params_, input_order);
-    if (input_order.size() > 10)
-    {
-        int i = 0;
-        i++;
-    }
-
 
 	coeff_t processed_block[kBlockSize];
 	memcpy(processed_block, block, sizeof(processed_block));
@@ -439,6 +434,7 @@ void Processor::ComputeBlockZeroingOrder(
       }
 
       float max_err = 0;
+
       for (int iy = 0; iy < factor_y; ++iy) {
         for (int ix = 0; ix < factor_x; ++ix) {
           int block_xx = block_x * factor_x + ix;
@@ -450,6 +446,8 @@ void Processor::ComputeBlockZeroingOrder(
         }
       }
 
+	  /*max_err = */((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(*img, 0, 0, candidate_block);
+
       if (max_err < best_err) { // TOBEREMOVE:�ҳ���С����ֵ��i
         best_err = max_err;
         best_i = i;
@@ -928,10 +926,10 @@ bool Process(const Params& params, ProcessStats* stats,
   if (stats == nullptr) {
     stats = &dummy_stats;
   }
-  std::unique_ptr<ButteraugliComparator> comparator;
+  std::unique_ptr<ButteraugliComparatorEx> comparator;
   if (jpg.width >= 32 && jpg.height >= 32) {
     comparator.reset(
-        new ButteraugliComparator(jpg.width, jpg.height, &rgb,
+        new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb,
                                   params.butteraugli_target, stats));
   }
   bool ok = ProcessJpegData(params, jpg, comparator.get(), &out, stats);
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index fdc1f49a..6fd2d281 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -1177,11 +1177,10 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage(
 	{
 		result.resize(xsize_ * ysize_);
 		clDiffmapOpsinDynamicsImage(xyb0_arg[0].data(), xyb0_arg[1].data(), xyb0_arg[2].data(),
-			xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, res_xsize_, res_ysize_, result.data(), result.size());
+			xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, result.data());
 		return;
 	}
 
-
   if (xsize_ < 8 || ysize_ < 8) return;
   auto xyb0 = xyb0_arg;
   {

From 389777fbfd7da6647e655536f4fcca627fc46b0b Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 12 May 2017 14:15:45 +0800
Subject: [PATCH 062/189] =?UTF-8?q?fix-mapbuffer=E9=95=BF=E5=BA=A6?=
 =?UTF-8?q?=E5=92=8C=E9=9C=80=E8=A6=81=E7=9A=84=E4=B8=8D=E7=AC=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 38d31785..5fe8da6d 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -1099,7 +1099,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 
     clCalculateDiffmapEx(mem_result, xsize, ysize, step);
 
-	cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, res_xsize * res_ysize * sizeof(float), 0, NULL, NULL, &err);
+	cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
 	err = clFinish(ocl.commandQueue);
 	memcpy(result, result_r, channel_size);
 

From aaddc932966deed60f1423715a11fc1219eef8f4 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 12 May 2017 17:15:31 +0800
Subject: [PATCH 063/189] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20clButteraugliCompa?=
 =?UTF-8?q?rator=EF=BC=8C=E9=81=BF=E5=85=8D=E5=AF=B9=E7=AC=AC=E4=B8=89?=
 =?UTF-8?q?=E6=96=B9=E5=BA=93=E4=BB=A3=E7=A0=81=E7=A0=B4=E5=9D=8F=E5=A4=AA?=
 =?UTF-8?q?=E5=A4=A7?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clbutter_comparator.cpp             | 283 ++++++++++++++++++
 clguetzli/clbutter_comparator.h               |  86 ++++++
 .../{clBlock.cpp => clguetzli_comparator.cpp} |   2 +-
 .../{clBlock.h => clguetzli_comparator.h}     |   3 +-
 guetzli.vcxproj                               |   6 +-
 guetzli.vcxproj.filters                       |  10 +-
 guetzli/butteraugli_comparator.h              |   3 +-
 guetzli/processor.cc                          |   2 +-
 .../butteraugli/butteraugli/butteraugli.cc    | 231 +-------------
 .../butteraugli/butteraugli/butteraugli.h     |  12 +-
 10 files changed, 404 insertions(+), 234 deletions(-)
 create mode 100644 clguetzli/clbutter_comparator.cpp
 create mode 100644 clguetzli/clbutter_comparator.h
 rename clguetzli/{clBlock.cpp => clguetzli_comparator.cpp} (99%)
 rename clguetzli/{clBlock.h => clguetzli_comparator.h} (91%)

diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
new file mode 100644
index 00000000..650e4373
--- /dev/null
+++ b/clguetzli/clbutter_comparator.cpp
@@ -0,0 +1,283 @@
+#include "clbutter_comparator.h"
+#include "clguetzli.h"
+#include "clguetzli_test.h"
+
+namespace butteraugli
+{
+    // Forwards xsize, ysize and step to the base comparator; the class declares no
+    // data members of its own, so there is nothing else to initialise.
+    clButteraugliComparator::clButteraugliComparator(size_t xsize, size_t ysize, int step)
+        : ButteraugliComparator(xsize, ysize, step)
+    {}
+
+    // Computes the diffmap for an xyb image pair. The OpenCL routine is used when
+    // g_useOpenCL is set and both dimensions exceed 100; otherwise the base class runs.
+    void clButteraugliComparator::DiffmapOpsinDynamicsImage(const std::vector<std::vector<float>> &xyb0,
+        std::vector<std::vector<float>> &xyb1,
+        std::vector<float> &result)
+    {
+        const bool runOnDevice = g_useOpenCL && xsize_ > 100 && ysize_ > 100;
+        if (!runOnDevice) {
+            ButteraugliComparator::DiffmapOpsinDynamicsImage(xyb0, xyb1, result);
+            return;
+        }
+        result.resize(xsize_ * ysize_);  // result.data() is handed to the CL routine below
+        clDiffmapOpsinDynamicsImage(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+            xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, result.data());
+    }
+
+    // Runs the base-class BlockDiffMap and, when g_checkOpenCL is set, passes the
+    // inputs and the computed dc/ac maps to tclBlockDiffMap.
+    void clButteraugliComparator::BlockDiffMap(const std::vector<std::vector<float> > &xyb0,
+        const std::vector<std::vector<float> > &xyb1,
+        std::vector<float>* block_diff_dc,
+        std::vector<float>* block_diff_ac)
+    {
+        ButteraugliComparator::BlockDiffMap(xyb0, xyb1, block_diff_dc, block_diff_ac);
+        if (!g_checkOpenCL) return;
+
+        tclBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+            xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+            xsize_, ysize_, step_,
+            block_diff_dc->data(), block_diff_ac->data());
+    }
+
+
+    // Runs the base-class EdgeDetectorMap and, when g_checkOpenCL is set, passes the
+    // inputs and the computed map to tclEdgeDetectorMap.
+    void clButteraugliComparator::EdgeDetectorMap(const std::vector<std::vector<float> > &xyb0,
+        const std::vector<std::vector<float> > &xyb1,
+        std::vector<float>* edge_detector_map)
+    {
+        ButteraugliComparator::EdgeDetectorMap(xyb0, xyb1, edge_detector_map);
+        if (!g_checkOpenCL) return;
+
+        tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+            xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+            xsize_, ysize_, step_,
+            edge_detector_map->data());
+    }
+
+    // Runs the base-class EdgeDetectorLowFreq on block_diff_ac. When g_checkOpenCL is
+    // set, the values block_diff_ac held on entry are kept and handed to
+    // tclEdgeDetectorLowFreq together with the values after the base call.
+    void clButteraugliComparator::EdgeDetectorLowFreq(const std::vector<std::vector<float> > &xyb0,
+        const std::vector<std::vector<float> > &xyb1,
+        std::vector<float>* block_diff_ac)
+    {
+        // Snapshot the input only if the check is going to run.
+        std::vector<float> acOnEntry;
+        if (g_checkOpenCL) acOnEntry = *block_diff_ac;
+
+        ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac);
+        if (!g_checkOpenCL) return;
+
+        // acOnEntry: before the base call; block_diff_ac: after it.
+        tclEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+            xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+            xsize_, ysize_, step_,
+            acOnEntry.data(), block_diff_ac->data());
+    }
+
+    // Runs the base-class CombineChannels. When g_checkOpenCL is set, the contents of
+    // *result on entry are kept and passed to tclCombineChannels together with the
+    // combined output. Buffers are obtained with data() instead of &v[0]: indexing
+    // element 0 of an empty vector is undefined behaviour, data() is always valid.
+    void clButteraugliComparator::CombineChannels(const std::vector<std::vector<float> >& mask_xyb,
+        const std::vector<std::vector<float> >& mask_xyb_dc,
+        const std::vector<float>& block_diff_dc,
+        const std::vector<float>& block_diff_ac,
+        const std::vector<float>& edge_detector_map,
+        std::vector<float>* result)
+    {
+        // Pre-call snapshot of *result; stays empty when the check is disabled.
+        std::vector<float> temp;
+        if (g_checkOpenCL) temp = *result;
+
+        ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result);
+        if (!g_checkOpenCL) return;
+
+        tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(),
+            mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(),
+            block_diff_dc.data(),
+            block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, temp.data(), result->data());
+    }
+
+    // Wrapper around _MinSquareVal. With g_checkOpenCL set, the xsize * ysize input
+    // floats are copied before the call and given to tclMinSquareVal with the output.
+    void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values)
+    {
+        const size_t count = xsize * ysize;
+        std::vector<float> img;
+        if (g_checkOpenCL) {
+            // keep the pre-call contents; `values` is handed to _MinSquareVal next
+            img.assign(values, values + count);
+        }
+
+        _MinSquareVal(square_size, offset, xsize, ysize, values);
+
+        if (g_checkOpenCL) {
+            tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values);
+        }
+    }
+
+    // Wrapper around _Average5x5. When g_checkOpenCL is set, a copy of *diffs is taken
+    // before the call and passed to tclAverage5x5 together with the processed
+    // vector.
+    void Average5x5(int xsize, int ysize, std::vector<float>* diffs)
+    {
+        // No copy is made while the check is off.
+        std::vector<float> diffsOnEntry;
+        if (g_checkOpenCL) diffsOnEntry = *diffs;
+
+        _Average5x5(xsize, ysize, diffs);
+
+        if (g_checkOpenCL) {
+            tclAverage5x5(xsize, ysize, diffsOnEntry, *diffs);
+        }
+    }
+
+    // Wrapper around _DiffPrecompute; additionally calls tclDiffPrecompute with the
+    // same arguments when g_checkOpenCL is set.
+    void DiffPrecompute(const std::vector<std::vector<float> > &xyb0, const std::vector<std::vector<float> > &xyb1, size_t xsize, size_t ysize, std::vector<std::vector<float> > *mask)
+    {
+        _DiffPrecompute(xyb0, xyb1, xsize, ysize, mask);
+        if (!g_checkOpenCL) return;
+
+        tclDiffPrecompute(xyb0, xyb1, xsize, ysize, mask);
+    }
+
+    // Wrapper around _Mask. When g_checkOpenCL is set, the inputs and the per-channel
+    // buffers of the computed mask / mask_dc are passed to tclMask afterwards.
+    void Mask(const std::vector<std::vector<float> > &xyb0,
+        const std::vector<std::vector<float> > &xyb1,
+        size_t xsize, size_t ysize,
+        std::vector<std::vector<float> > *mask,
+        std::vector<std::vector<float> > *mask_dc)
+    {
+        _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc);
+        if (!g_checkOpenCL) return;
+
+        tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+            xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+            xsize, ysize,
+            (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(),
+            (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data());
+    }
+
+    // Wrapper around _CalculateDiffmap. When g_checkOpenCL is set, a copy of *diffmap
+    // is taken before the call and passed (pointer and element count) to
+    // tclCalculateDiffmap together with the processed diffmap.
+    void CalculateDiffmap(const size_t xsize, const size_t ysize,
+        const size_t step,
+        std::vector<float>* diffmap)
+    {
+        // No copy is made while the check is off.
+        std::vector<float> diffmapOnEntry;
+        if (g_checkOpenCL) diffmapOnEntry = *diffmap;
+
+        _CalculateDiffmap(xsize, ysize, step, diffmap);
+
+        if (g_checkOpenCL) {
+            tclCalculateDiffmap(xsize, ysize, step, diffmapOnEntry.data(), diffmapOnEntry.size(), diffmap->data());
+        }
+    }
+
+    // Wrapper around _MaskHighIntensityChange. When g_checkOpenCL is set, the channel
+    // buffers of c0/c1 and of the resulting xyb0/xyb1 go to tclMaskHighIntensityChange.
+    void MaskHighIntensityChange(
+        size_t xsize, size_t ysize,
+        const std::vector<std::vector<float> > &c0,
+        const std::vector<std::vector<float> > &c1,
+        std::vector<std::vector<float> > &xyb0,
+        std::vector<std::vector<float> > &xyb1)
+    {
+        _MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1);
+        if (!g_checkOpenCL) return;
+
+        tclMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(),
+            c1[0].data(), c1[1].data(), c1[2].data(),
+            xsize, ysize,
+            xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+            xyb1[0].data(), xyb1[1].data(), xyb1[2].data());
+    }
+
+    // Wrapper around _ScaleImage. When g_checkOpenCL is set, a copy of *result is
+    // taken before the call and passed to tclScaleImage together with the scaled
+    // buffer and its element count.
+    void ScaleImage(double scale, std::vector<float> *result)
+    {
+        // No copy is made while the check is off.
+        std::vector<float> resultOnEntry;
+        if (g_checkOpenCL) resultOnEntry = *result;
+
+        _ScaleImage(scale, result);
+
+        if (g_checkOpenCL) {
+            tclScaleImage(scale, resultOnEntry.data(), result->data(), result->size());
+        }
+    }
+
+    // Wrapper around _Convolution; additionally calls tclConvolution with the same
+    // arguments when g_checkOpenCL is set.
+    void Convolution(size_t xsize, size_t ysize,
+        size_t xstep,
+        size_t len, size_t offset,
+        const float* __restrict__ multipliers,
+        const float* __restrict__ inp,
+        float border_ratio,
+        float* __restrict__ result)
+    {
+        _Convolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
+        if (!g_checkOpenCL) return;
+
+        tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
+    }
+
+    // Wrapper around _Blur. With g_checkOpenCL set, the xsize * ysize input floats are
+    // copied before the call and given to tclBlur together with the blurred channel.
+    void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
+        double border_ratio)
+    {
+        const size_t count = xsize * ysize;
+        std::vector<float> channelOnEntry;
+        if (g_checkOpenCL) {
+            channelOnEntry.assign(channel, channel + count);
+        }
+
+        _Blur(xsize, ysize, channel, sigma, border_ratio);
+
+        if (g_checkOpenCL) {
+            tclBlur(channelOnEntry.data(), xsize, ysize, sigma, border_ratio, channel);
+        }
+    }
+
+    // Applies OpsinDynamicsImage to rgb. If g_useOpenCL is set and both dimensions
+    // exceed 100, clOpsinDynamicsImage works directly on the three channel buffers;
+    // otherwise _OpsinDynamicsImage runs, followed by the tclOpsinDynamicsImage
+    // check when g_checkOpenCL is set.
+    void OpsinDynamicsImage(size_t xsize, size_t ysize,
+        std::vector<std::vector<float> > &rgb)
+    {
+        const bool runOnDevice = g_useOpenCL && xsize > 100 && ysize > 100;
+        if (runOnDevice) {
+            // Device path: hand the raw channel pointers to the CL routine.
+            float *chan0 = rgb[0].data();
+            float *chan1 = rgb[1].data();
+            float *chan2 = rgb[2].data();
+            clOpsinDynamicsImage(xsize, ysize, chan0, chan1, chan2);
+            return;
+        }
+
+        // Host path: keep the untouched input only when the check will need it.
+        std::vector< std::vector<float>> rgbOnEntry;
+        if (g_checkOpenCL) rgbOnEntry = rgb;
+
+        _OpsinDynamicsImage(xsize, ysize, rgb);
+
+        if (g_checkOpenCL) {
+            tclOpsinDynamicsImage(rgbOnEntry[0].data(), rgbOnEntry[1].data(), rgbOnEntry[2].data(), xsize, ysize,
+                rgb[0].data(), rgb[1].data(), rgb[2].data());
+        }
+    }
+}
\ No newline at end of file
diff --git a/clguetzli/clbutter_comparator.h b/clguetzli/clbutter_comparator.h
new file mode 100644
index 00000000..eb2e4e32
--- /dev/null
+++ b/clguetzli/clbutter_comparator.h
@@ -0,0 +1,86 @@
+#pragma once
+#include <vector>
+#include "butteraugli/butteraugli.h"
+#ifdef _MSC_VER  // MSVC spells the keyword __restrict; GCC/Clang accept __restrict__ as is
+#define __restrict__ __restrict
+#endif
+namespace butteraugli {
+    // ButteraugliComparator subclass that adds OpenCL dispatch and check hooks.
+    class clButteraugliComparator : public ButteraugliComparator
+    {
+    public:
+        clButteraugliComparator(size_t xsize, size_t ysize, int step);
+        // Calls clDiffmapOpsinDynamicsImage when g_useOpenCL is set and both dimensions exceed 100.
+        virtual void DiffmapOpsinDynamicsImage(const std::vector<std::vector<float>> &xyb0,
+            std::vector<std::vector<float>> &xyb1,
+            std::vector<float> &result);
+        // Base-class BlockDiffMap, then tclBlockDiffMap when g_checkOpenCL is set.
+        virtual void BlockDiffMap(const std::vector<std::vector<float> > &rgb0,
+            const std::vector<std::vector<float> > &rgb1,
+            std::vector<float>* block_diff_dc,
+            std::vector<float>* block_diff_ac);
+        // The remaining methods follow the same pattern: run the base-class method,
+        // then call the matching tcl* routine when g_checkOpenCL is set.
+        virtual void EdgeDetectorMap(const std::vector<std::vector<float> > &rgb0,
+            const std::vector<std::vector<float> > &rgb1,
+            std::vector<float>* edge_detector_map);
+        // Also hands the tcl* routine a copy of block_diff_ac taken before the base call.
+        virtual void EdgeDetectorLowFreq(const std::vector<std::vector<float> > &rgb0,
+            const std::vector<std::vector<float> > &rgb1,
+            std::vector<float>* block_diff_ac);
+        // Also hands the tcl* routine a copy of *result taken before the base call.
+        virtual void CombineChannels(const std::vector<std::vector<float> >& scale_xyb,
+            const std::vector<std::vector<float> >& scale_xyb_dc,
+            const std::vector<float>& block_diff_dc,
+            const std::vector<float>& block_diff_ac,
+            const std::vector<float>& edge_detector_map,
+            std::vector<float>* result);
+    };
+    // Underscore-prefixed implementations called by the wrappers in clbutter_comparator.cpp.
+    void _MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values);  // NOTE(review): '_' + uppercase names are reserved identifiers in C++
+    void _Average5x5(int xsize, int ysize, std::vector<float>* diffs);
+    void _DiffPrecompute(const std::vector<std::vector<float> > &xyb0, const std::vector<std::vector<float> > &xyb1, size_t xsize, size_t ysize, std::vector<std::vector<float> > *mask);
+    void _Mask(const std::vector<std::vector<float> > &xyb0,
+        const std::vector<std::vector<float> > &xyb1,
+        size_t xsize, size_t ysize,
+        std::vector<std::vector<float> > *mask,
+        std::vector<std::vector<float> > *mask_dc);
+    void _CalculateDiffmap(const size_t xsize, const size_t ysize,
+        const size_t step,
+        std::vector<float>* diffmap);
+    void _OpsinDynamicsImage(size_t xsize, size_t ysize,
+        std::vector<std::vector<float> > &rgb);
+    void _MaskHighIntensityChange(
+        size_t xsize, size_t ysize,
+        const std::vector<std::vector<float> > &c0,
+        const std::vector<std::vector<float> > &c1,
+        std::vector<std::vector<float> > &xyb0,
+        std::vector<std::vector<float> > &xyb1);
+    void _ScaleImage(double scale, std::vector<float> *result);
+    void _Convolution(size_t xsize, size_t ysize,
+        size_t xstep,
+        size_t len, size_t offset,
+        const float* __restrict__ multipliers,
+        const float* __restrict__ inp,
+        float border_ratio,
+        float* __restrict__ result);
+    void _Blur(size_t xsize, size_t ysize, float* channel, double sigma,
+        double border_ratio);
+    // Wrappers (clbutter_comparator.cpp): call the _-prefixed function, then the tcl* check if g_checkOpenCL is set.
+    void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values);
+    void Average5x5(int xsize, int ysize, std::vector<float>* diffs);
+    void DiffPrecompute(const std::vector<std::vector<float> > &xyb0, const std::vector<std::vector<float> > &xyb1, size_t xsize, size_t ysize, std::vector<std::vector<float> > *mask);
+    void ScaleImage(double scale, std::vector<float> *result);
+    void Convolution(size_t xsize, size_t ysize,
+        size_t xstep,
+        size_t len, size_t offset,
+        const float* __restrict__ multipliers,
+        const float* __restrict__ inp,
+        float border_ratio,
+        float* __restrict__ result);
+    void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
+        double border_ratio);
+    void CalculateDiffmap(const size_t xsize, const size_t ysize,
+        const size_t step,
+        std::vector<float>* diffmap);
+}
\ No newline at end of file
diff --git a/clguetzli/clBlock.cpp b/clguetzli/clguetzli_comparator.cpp
similarity index 99%
rename from clguetzli/clBlock.cpp
rename to clguetzli/clguetzli_comparator.cpp
index 4650a813..ce3a9b64 100644
--- a/clguetzli/clBlock.cpp
+++ b/clguetzli/clguetzli_comparator.cpp
@@ -1,6 +1,6 @@
 #include <stdint.h>
 #include <algorithm>
-#include "clBlock.h"
+#include "clguetzli_comparator.h"
 #include "guetzli\idct.h"
 
 
diff --git a/clguetzli/clBlock.h b/clguetzli/clguetzli_comparator.h
similarity index 91%
rename from clguetzli/clBlock.h
rename to clguetzli/clguetzli_comparator.h
index a3c91e71..778d0532 100644
--- a/clguetzli/clBlock.h
+++ b/clguetzli/clguetzli_comparator.h
@@ -11,8 +11,9 @@ namespace guetzli {
 			const std::vector<uint8_t>* rgb,
 			const float target_distance, ProcessStats* stats);
 
-		void StartBlockComparisons();
+        //void Compare(const OutputImage& img) override;
 
+		void StartBlockComparisons();
 		void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y);
 
 		double CompareBlockEx(const OutputImage& img, int off_x, int off_y, coeff_t* candidate_block);
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index d4388976..3ae4554f 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -193,8 +193,9 @@
     </Link>
   </ItemDefinitionGroup>
   <ItemGroup>
-    <ClInclude Include="clguetzli\clBlock.h" />
+    <ClInclude Include="clguetzli\clbutter_comparator.h" />
     <ClInclude Include="clguetzli\clguetzli.h" />
+    <ClInclude Include="clguetzli\clguetzli_comparator.h" />
     <ClInclude Include="clguetzli\clguetzli_test.h" />
     <ClInclude Include="clguetzli\ocl.h" />
     <ClInclude Include="clguetzli\utils.h" />
@@ -289,8 +290,9 @@
     <ClInclude Include="third_party\zlib\zutil.h" />
   </ItemGroup>
   <ItemGroup>
-    <ClCompile Include="clguetzli\clBlock.cpp" />
+    <ClCompile Include="clguetzli\clbutter_comparator.cpp" />
     <ClCompile Include="clguetzli\clguetzli.cpp" />
+    <ClCompile Include="clguetzli\clguetzli_comparator.cpp" />
     <ClCompile Include="clguetzli\clguetzli_test.cpp" />
     <ClCompile Include="clguetzli\ocl.cpp" />
     <ClCompile Include="clguetzli\utils.cpp" />
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index 0f876fd4..9b0a7ad0 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -306,7 +306,10 @@
     <ClInclude Include="clguetzli\clguetzli_test.h">
       <Filter>clguetzli</Filter>
     </ClInclude>
-    <ClInclude Include="clguetzli\clBlock.h">
+    <ClInclude Include="clguetzli\clbutter_comparator.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
+    <ClInclude Include="clguetzli\clguetzli_comparator.h">
       <Filter>clguetzli</Filter>
     </ClInclude>
   </ItemGroup>
@@ -572,7 +575,10 @@
     <ClCompile Include="clguetzli\clguetzli_test.cpp">
       <Filter>clguetzli</Filter>
     </ClCompile>
-    <ClCompile Include="clguetzli\clBlock.cpp">
+    <ClCompile Include="clguetzli\clbutter_comparator.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
+    <ClCompile Include="clguetzli\clguetzli_comparator.cpp">
       <Filter>clguetzli</Filter>
     </ClCompile>
   </ItemGroup>
diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h
index 775b2fd7..0136f2bb 100644
--- a/guetzli/butteraugli_comparator.h
+++ b/guetzli/butteraugli_comparator.h
@@ -20,6 +20,7 @@
 #include <vector>
 
 #include "butteraugli/butteraugli.h"
+#include "clguetzli/clbutter_comparator.h"
 #include "guetzli/comparator.h"
 #include "guetzli/jpeg_data.h"
 #include "guetzli/output_image.h"
@@ -78,7 +79,7 @@ class ButteraugliComparator : public Comparator {
   std::vector<std::vector<float>> rgb_linear_pregamma_;
   std::vector<std::vector<float>> mask_xyz_;
   std::vector<std::vector<std::vector<float>>> per_block_pregamma_;
-  ::butteraugli::ButteraugliComparator comparator_;
+  ::butteraugli::clButteraugliComparator comparator_;
   float distance_;
   std::vector<float> distmap_;
   ProcessStats* stats_;
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 1637284d..62613d04 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -22,7 +22,7 @@
 #include <vector>
 
 #include "guetzli/butteraugli_comparator.h"
-#include "clguetzli\clBlock.h"
+#include "clguetzli/clguetzli_comparator.h"
 #include "guetzli/comparator.h"
 #include "guetzli/debug_print.h"
 #include "guetzli/fast_log.h"
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 6fd2d281..73b78a05 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -30,6 +30,7 @@
 //   * Blur - to hold the smoothing code
 
 #include "butteraugli/butteraugli.h"
+#include "clguetzli/clbutter_comparator.h"
 
 #include <assert.h>
 #include <math.h>
@@ -62,7 +63,7 @@ inline double DotProduct(const float u[3], const double v[3]) {
 }
 
 // Computes a horizontal convolution and transposes the result.
-static void Convolution(size_t xsize, size_t ysize,
+void _Convolution(size_t xsize, size_t ysize,
 	size_t xstep,
 	size_t len, size_t offset,
 	const float* __restrict__ multipliers,
@@ -93,23 +94,10 @@ static void Convolution(size_t xsize, size_t ysize,
       result[ox * ysize + y] = static_cast<float>(sum * scale);
     }
   }
-
-  if (g_checkOpenCL)
-  {
-	  tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
-  }
 }
 
-void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
+void _Blur(size_t xsize, size_t ysize, float* channel, double sigma,
           double border_ratio) {
-
-    std::vector<float> orignChannel;
-	if (g_checkOpenCL)
-	{
-		orignChannel.resize(xsize * ysize);
-		memcpy(orignChannel.data(), channel, xsize * ysize * sizeof(float));
-	}
-
   PROFILER_FUNC;
   double m = 2.25;  // Accuracy increases when m is increased.
   const double scaler = -1.0 / (2 * sigma * sigma);
@@ -144,15 +132,6 @@ void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
             downsampled_output[(y / ystep) * dxsize + (x / xstep)];
       }
     }
-	if (g_checkOpenCL)
-	{
-		tclUpsample(downsampled_output.data(), xsize, ysize, xstep, ystep, channel);
-	}
-  }
-
-  if (g_checkOpenCL)
-  {
-	  tclBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel);
   }
 }
 
@@ -798,7 +777,7 @@ ButteraugliComparator::ButteraugliComparator(
   assert(step <= 4);
 }
 
-void MaskHighIntensityChange(
+void _MaskHighIntensityChange(
     size_t xsize, size_t ysize,
     const std::vector<std::vector<float> > &c0,
     const std::vector<std::vector<float> > &c1,
@@ -850,15 +829,6 @@ void MaskHighIntensityChange(
       }
     }
   }
-
-  if (g_checkOpenCL)
-  {
-	  tclMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(),
-		  c1[0].data(), c1[1].data(), c1[2].data(),
-		  xsize, ysize,
-		  xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-		  xyb1[0].data(), xyb1[1].data(), xyb1[2].data());
-  }
 }
 
 double SimpleGamma(double v) {
@@ -936,28 +906,6 @@ struct RationalPolynomial {
     return b1;
   }
 
-#ifdef ENABLE_OPENCL_CHECK
-  static double EvaluatePolynomialNonRecursion(const double x, const double *coefficients, int n) {
-	double b1 = 0.0;
-	double b2 = 0.0;
-
-	for (int i = n - 1; i >= 0; i--)
-	{
-		if (i == 0) {
-			const double x_b1 = x * b1;
-			b1 = x_b1 - b2 + coefficients[0];
-			break;
-		}
-		const double x_b1 = x * b1;
-		const double t = (x_b1 + x_b1) - b2 + coefficients[i];
-		b2 = b1;
-		b1 = t;
-	}
-
-	return b1;
-  }
-#endif // ENABLE_OPENCL_CHECK
-
   // Evaluates the polynomial at x (in [min_value, max_value]).
   inline double operator()(const float x) const {
     // First normalize to [0, 1].
@@ -996,56 +944,13 @@ static inline float GammaPolynomial(float value) {
   return static_cast<float>(r(value));
 }
 
-#ifdef ENABLE_OPENCL_CHECK
-static double GammaNonRecursion(double v) {
-	double min_value = 0.770000000000000;
-	double max_value = 274.579999999999984;
-
-	double p[5 + 1] = {
-		881.979476556478289, 1496.058452015812463, 908.662212739659481,
-		373.566100223287378, 85.840860336314364, 6.683258861509244,
-	};
-	double q[5 + 1] = {
-		12.262350348616792, 20.557285797683576, 12.161463238367844,
-		4.711532733641639, 0.899112889751053, 0.035662329617191,
-	};
-
-	// First normalize to [0, 1].
-	const double x01 = (v - min_value) / (max_value - min_value);
-	// And then to [-1, 1] domain of Chebyshev polynomials.
-	const double xc = 2.0 * x01 - 1.0;
-
-	const double yp = RationalPolynomial::EvaluatePolynomialNonRecursion(xc, p, 6);
-	const double yq = RationalPolynomial::EvaluatePolynomialNonRecursion(xc, q, 6);
-	if (yq == 0.0) return 0.0;
-	return static_cast<float>(yp / yq);
-}
-#endif // ENABLE_OPENCL_CHECK
-
 static inline double Gamma(double v) {
   // return SimpleGamma(v);
   return GammaPolynomial(static_cast<float>(v));
 }
 
-void OpsinDynamicsImage(size_t xsize, size_t ysize,
+void _OpsinDynamicsImage(size_t xsize, size_t ysize,
                         std::vector<std::vector<float> > &rgb) {
-
-    if (g_useOpenCL && xsize > 100 && ysize > 100)
-    {
-        float * r = rgb[0].data();
-        float * g = rgb[1].data();
-        float * b = rgb[2].data();
-
-        clOpsinDynamicsImage(xsize, ysize, r, g, b);
-        return;
-    }
-
-	std::vector< std::vector<float>> orig_rgb;
-	if (g_checkOpenCL)
-	{
-		orig_rgb = rgb;
-	}
-
   PROFILER_FUNC;
   std::vector<std::vector<float> > blurred = rgb;
   static const double kSigma = 1.1;
@@ -1075,41 +980,20 @@ void OpsinDynamicsImage(size_t xsize, size_t ysize,
     rgb[1][i] = static_cast<float>(y);
     rgb[2][i] = static_cast<float>(z);
   }
-
-  if (g_checkOpenCL)
-  {
-	  tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), xsize, ysize,
-		  rgb[0].data(), rgb[1].data(), rgb[2].data());
-  }
 }
 
-static void ScaleImage(double scale, std::vector<float> *result) {
-  std::vector<float> result_org;
-	if (g_checkOpenCL)
-	{
-    result_org = *result;
-	}
+void _ScaleImage(double scale, std::vector<float> *result) {
   PROFILER_FUNC;
   for (size_t i = 0; i < result->size(); ++i) {
     (*result)[i] *= static_cast<float>(scale);
   }
-
-  if (g_checkOpenCL)
-  {
-    tclScaleImage(scale, result_org.data(), (*result).data(), (*result).size());
-  }
 }
 
 // Making a cluster of local errors to be more impactful than
 // just a single error.
-void CalculateDiffmap(const size_t xsize, const size_t ysize,
+void _CalculateDiffmap(const size_t xsize, const size_t ysize,
                       const size_t step,
                       std::vector<float>* diffmap) {
-  std::vector<float> diffmap_org;
-  if (g_checkOpenCL)
-  {
-	  diffmap_org = *diffmap;
-  }
   PROFILER_FUNC;
   // Shift the diffmap more correctly above the pixels, from 2.5 pixels to 0.5
   // pixels distance over the original image. The border of 2 pixels on top and
@@ -1162,10 +1046,6 @@ void CalculateDiffmap(const size_t xsize, const size_t ysize,
     }
     ScaleImage(scale, diffmap);
   }
-  if (g_checkOpenCL)
-  {
-	  tclCalculateDiffmap(xsize, ysize, step, diffmap_org.data(), diffmap_org.size(), (*diffmap).data());
-  }
 }
 
 void ButteraugliComparator::DiffmapOpsinDynamicsImage(
@@ -1173,14 +1053,6 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage(
     std::vector<std::vector<float>> &xyb1,
     std::vector<float> &result) {
 
-	if (g_useOpenCL && xsize_ > 100 && ysize_ > 100)
-	{
-		result.resize(xsize_ * ysize_);
-		clDiffmapOpsinDynamicsImage(xyb0_arg[0].data(), xyb0_arg[1].data(), xyb0_arg[2].data(),
-			xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, result.data());
-		return;
-	}
-
   if (xsize_ < 8 || ysize_ < 8) return;
   auto xyb0 = xyb0_arg;
   {
@@ -1244,14 +1116,6 @@ void ButteraugliComparator::BlockDiffMap(
       }
     }
   }
-
-  if (g_checkOpenCL)
-  {
-	  tclBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-		  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
-		  xsize_, ysize_, step_,
-		  (*block_diff_dc).data(), (*block_diff_ac).data());
-  }
 }
 
 void ButteraugliComparator::EdgeDetectorMap(
@@ -1284,14 +1148,6 @@ void ButteraugliComparator::EdgeDetectorMap(
       }
     }
   }
-
-  if (g_checkOpenCL)
-  {
-	  tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-		  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
-		  xsize_, ysize_, step_,
-		  (*edge_detector_map).data());
-  }
 }
 
 void ButteraugliComparator::EdgeDetectorLowFreq(
@@ -1299,12 +1155,6 @@ void ButteraugliComparator::EdgeDetectorLowFreq(
     const std::vector<std::vector<float> > &xyb1,
     std::vector<float>* block_diff_ac) {
 
-	std::vector<float> orign_ac;
-	if (g_checkOpenCL)
-	{
-		orign_ac = *block_diff_ac;
-	}
-
   PROFILER_FUNC;
   static const double kSigma = 14;
   static const double kMul = 10;
@@ -1355,14 +1205,6 @@ void ButteraugliComparator::EdgeDetectorLowFreq(
       }
     }
   }
-
-  if (g_checkOpenCL)
-  {
-	  tclEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-		  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
-		  xsize_, ysize_, step_,
-		  orign_ac.data(), (*block_diff_ac).data());
-  }
 }
 
 void ButteraugliComparator::CombineChannels(
@@ -1376,12 +1218,6 @@ void ButteraugliComparator::CombineChannels(
   PROFILER_FUNC;
   result->resize(res_xsize_ * res_ysize_);
 
-  std::vector<float> temp;
-  if (g_checkOpenCL)
-  {
-	  temp = *result;
-  }
-
   for (size_t res_y = 0; res_y + (8 - step_) < ysize_; res_y += step_) {
     for (size_t res_x = 0, j = 0; res_x + (8 - step_) < xsize_; res_x += step_, j++) {
       size_t res_ix = (res_y * res_xsize_ + res_x) / step_;
@@ -1397,14 +1233,6 @@ void ButteraugliComparator::CombineChannels(
            DotProduct(&edge_detector_map[3 * res_ix], mask));
     }
   }
-
-  if (g_checkOpenCL)
-  {
-	  tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(),
-		  mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(),
-		  block_diff_dc.data(),
-		  block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, &temp[0], &(*result)[0]);
-  }
 }
 
 double ButteraugliScoreFromDiffmap(const std::vector<float>& diffmap) {
@@ -1502,20 +1330,13 @@ double MaskDcB(double delta) {
   return InterpolateClampNegative(lut.data(), lut.size(), delta);
 }
 
-void MinSquareVal(size_t square_size, size_t offset,
+void _MinSquareVal(size_t square_size, size_t offset,
 				  size_t xsize, size_t ysize,
                   float *values) {
   // offset is not negative and smaller than square_size.
   assert(offset < square_size);
   std::vector<float> tmp(xsize * ysize);
 
-  std::vector<float> img;
-  if (g_checkOpenCL)
-  {
-	  img.resize(xsize * ysize);
-	  memcpy(img.data(), values, xsize * ysize * sizeof(float));
-  }
-
   for (size_t y = 0; y < ysize; ++y) {
     const size_t minh = offset > y ? 0 : y - offset;
     const size_t maxh = std::min<size_t>(ysize, y + square_size - offset);
@@ -1552,21 +1373,10 @@ void MinSquareVal(size_t square_size, size_t offset,
         *pValuePoint = min; pValuePoint += xsize;
     }
   }
-
-  if (g_checkOpenCL)
-  {
-	  tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values);
-  }
 }
 
 // ===== Functions used by Mask only =====
-void Average5x5(int xsize, int ysize, std::vector<float>* diffs) {
-  std::vector<float> diffs_org;
-  if (g_checkOpenCL)
-  {
-    diffs_org = *diffs;
-  }
-
+void _Average5x5(int xsize, int ysize, std::vector<float>* diffs) {
   PROFILER_FUNC;
   if (xsize < 4 || ysize < 4) {
     // TODO: Make this work for small dimensions as well.
@@ -1617,14 +1427,9 @@ void Average5x5(int xsize, int ysize, std::vector<float>* diffs) {
   }
   *diffs = result;
   ScaleImage(scale, diffs);
-
-  if (g_checkOpenCL)
-  {
-	  tclAverage5x5(xsize, ysize, diffs_org, *diffs);
-  }
 }
 
-void DiffPrecompute(
+void _DiffPrecompute(
     const std::vector<std::vector<float> > &xyb0,
     const std::vector<std::vector<float> > &xyb1,
     size_t xsize, size_t ysize,
@@ -1677,14 +1482,9 @@ void DiffPrecompute(
       }
     }
   }
-
-  if (g_checkOpenCL)
-  {
-	  tclDiffPrecompute(xyb0, xyb1, xsize, ysize, mask);
-  }
 }
 
-void Mask(const std::vector<std::vector<float> > &xyb0,
+void _Mask(const std::vector<std::vector<float> > &xyb0,
           const std::vector<std::vector<float> > &xyb1,
           size_t xsize, size_t ysize,
           std::vector<std::vector<float> > *mask,
@@ -1735,15 +1535,6 @@ void Mask(const std::vector<std::vector<float> > &xyb0,
     ScaleImage(kGlobalScale * kGlobalScale, &(*mask)[i]);
     ScaleImage(kGlobalScale * kGlobalScale, &(*mask_dc)[i]);
   }
-
-  if (g_checkOpenCL)
-  {
-	  tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-		  xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
-		  xsize, ysize,
-		  (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(),
-		  (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data());
-  }
 }
 
 }  // namespace butteraugli
diff --git a/third_party/butteraugli/butteraugli/butteraugli.h b/third_party/butteraugli/butteraugli/butteraugli.h
index eeb91084..637f50ff 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.h
+++ b/third_party/butteraugli/butteraugli/butteraugli.h
@@ -46,26 +46,26 @@ class ButteraugliComparator {
   // Computes the butteraugli map between xyb0 and xyb1 and updates result.
   // Both xyb0 and xyb1 are in opsin-dynamics space.
   // NOTE: The xyb1 image is mutated by this function in-place.
-  void DiffmapOpsinDynamicsImage(const std::vector<std::vector<float>> &xyb0,
+  virtual void DiffmapOpsinDynamicsImage(const std::vector<std::vector<float>> &xyb0,
                                  std::vector<std::vector<float>> &xyb1,
                                  std::vector<float> &result);
 
- private:
-  void BlockDiffMap(const std::vector<std::vector<float> > &rgb0,
+ protected:
+  virtual void BlockDiffMap(const std::vector<std::vector<float> > &rgb0,
                     const std::vector<std::vector<float> > &rgb1,
                     std::vector<float>* block_diff_dc,
                     std::vector<float>* block_diff_ac);
 
 
-  void EdgeDetectorMap(const std::vector<std::vector<float> > &rgb0,
+  virtual void EdgeDetectorMap(const std::vector<std::vector<float> > &rgb0,
                        const std::vector<std::vector<float> > &rgb1,
                        std::vector<float>* edge_detector_map);
 
-  void EdgeDetectorLowFreq(const std::vector<std::vector<float> > &rgb0,
+  virtual void EdgeDetectorLowFreq(const std::vector<std::vector<float> > &rgb0,
                            const std::vector<std::vector<float> > &rgb1,
                            std::vector<float>* block_diff_ac);
 
-  void CombineChannels(const std::vector<std::vector<float> >& scale_xyb,
+  virtual void CombineChannels(const std::vector<std::vector<float> >& scale_xyb,
                        const std::vector<std::vector<float> >& scale_xyb_dc,
                        const std::vector<float>& block_diff_dc,
                        const std::vector<float>& block_diff_ac,

From 36905d7e7ccd40c0bef71951887e4b345f39ca25 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 12 May 2017 21:24:47 +0800
Subject: [PATCH 064/189] =?UTF-8?q?=E8=A7=84=E8=8C=83kernel=E5=87=BD?=
 =?UTF-8?q?=E6=95=B0=E5=90=8D=E4=BB=A5cl=E5=BC=80=E5=A4=B4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl  | 1613 ++++++++++++++++++++-------------------
 clguetzli/clguetzli.cpp |   38 +-
 clguetzli/clguetzli.h   |   16 +-
 3 files changed, 849 insertions(+), 818 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 308ef1d3..d44f5c07 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -6,461 +6,702 @@
 //#error "Double precision floating point not supported by OpenCL implementation."
 //#endif
 
-__kernel void MinSquareVal(__global float* pA, __global float* pC, int square_size, int offset)
+#define kBlockEdge 8
+#define kBlockSize (kBlockEdge * kBlockEdge)
+#define kBlockEdgeHalf  (kBlockEdge / 2)
+#define kBlockHalf (kBlockEdge * kBlockEdgeHalf)
+
+void   XybToVals(double x, double y, double z, double *valx, double *valy, double *valz);
+double InterpolateClampNegative(__global const double *array, int size, double sx);
+void   XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0,
+                                       double r1, double g1, double b1,
+                                       double factor, double res[3]);
+double DotProduct(__global float u[3], double v[3]);
+void   OpsinAbsorbance(const double in[3], double out[3]);
+void   RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz);
+double Gamma(double v);
+void   ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize],
+    __private double xyb1[3 * kBlockSize],
+    double diff_xyb_dc[3],
+    double diff_xyb_ac[3],
+    double diff_xyb_edge_dc[3]);
+void Butteraugli8x8CornerEdgeDetectorDiff(
+    int pos_x,
+    int pos_y,
+    int xsize,
+    int ysize,
+    __global float *r, __global float *g, __global float* b,
+    __global float *r2, __global float* g2, __global float *b2,
+    double* diff_xyb);
+
+__kernel void clOpsinDynamicsImage(
+    __global float *r, __global float *g, __global float *b,
+    __global float *r_blurred, __global float *g_blurred, __global float *b_blurred,
+    int size)
 {
-	const int x = get_global_id(0);
-	const int y = get_global_id(1);
-	const int width = get_global_size(0);
-	const int height = get_global_size(1);
+    const int i = get_global_id(0);
+    double pre[3] = { r_blurred[i], g_blurred[i],  b_blurred[i] };
+    double pre_mixed[3];
+    OpsinAbsorbance(pre, pre_mixed);
+
+    double sensitivity[3];
+    sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];
+    sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1];
+    sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2];
+
+    double cur_rgb[3] = { r[i], g[i],  b[i] };
+    double cur_mixed[3];
+    OpsinAbsorbance(cur_rgb, cur_mixed);
+    cur_mixed[0] *= sensitivity[0];
+    cur_mixed[1] *= sensitivity[1];
+    cur_mixed[2] *= sensitivity[2];
 
-	int minH = offset > y ? 0 : y - offset;
-	int maxH = min(y + square_size - offset, height);
+    double x, y, z;
+    RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z);
+    r[i] = x;
+    g[i] = y;
+    b[i] = z;
+}
 
-	int minW = offset > x ? 0 : x - offset;
-	int maxW = min(x + square_size - offset, width);
+__kernel void clMinSquareVal(__global float* pA, __global float* pC, int square_size, int offset)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int width = get_global_size(0);
+    const int height = get_global_size(1);
 
-	float minValue = pA[minH * width + minW];
+    int minH = offset > y ? 0 : y - offset;
+    int maxH = min(y + square_size - offset, height);
 
-	for (int j = minH; j < maxH; j++)
-	{
-		for (int i = minW; i < maxW; i++)
-		{
-			float tmp = pA[j * width + i];
-			if (tmp < minValue) minValue = tmp;
-		}
-	}
+    int minW = offset > x ? 0 : x - offset;
+    int maxW = min(x + square_size - offset, width);
 
-	pC[y * width + x] = minValue;
+    float minValue = pA[minH * width + minW];
+
+    for (int j = minH; j < maxH; j++)
+    {
+        for (int i = minW; i < maxW; i++)
+        {
+            float tmp = pA[j * width + i];
+            if (tmp < minValue) minValue = tmp;
+        }
+    }
+
+    pC[y * width + x] = minValue;
 }
 
-__kernel void ConvolutionX(__global float* multipliers, __global float* inp, __global float* result,
-	int step, int len, int offset, float border_ratio)
+__kernel void clConvolutionX(__global float* multipliers, __global float* inp, __global float* result,
+    int step, int len, int offset, float border_ratio)
 {
-	const int x = get_global_id(0);
-	const int y = get_global_id(1);
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
 
-	if (x % step != 0) return;
+    if (x % step != 0) return;
 
-	const int xsize = get_global_size(0);
-	const int ysize = get_global_size(1);
+    const int xsize = get_global_size(0);
+    const int ysize = get_global_size(1);
 
-	float weight_no_border = 0;
-	for (int j = 0; j <= 2 * offset; j++)
-	{
-		weight_no_border += multipliers[j];
-	}
+    float weight_no_border = 0;
+    for (int j = 0; j <= 2 * offset; j++)
+    {
+        weight_no_border += multipliers[j];
+    }
 
-	int minx = x < offset ? 0 : x - offset;
-	int maxx = min(xsize, x + len - offset);
+    int minx = x < offset ? 0 : x - offset;
+    int maxx = min(xsize, x + len - offset);
 
-	float weight = 0.0;
-	for (int j = minx; j < maxx; j++)
-	{
-		weight += multipliers[j - x + offset];
-	}
+    float weight = 0.0;
+    for (int j = minx; j < maxx; j++)
+    {
+        weight += multipliers[j - x + offset];
+    }
 
-	weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
-	float scale = 1.0 / weight;
+    weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
+    float scale = 1.0 / weight;
 
-	float sum = 0.0;
-	for (int j = minx; j < maxx; j++)
-	{
-		sum += inp[y * xsize + j] * multipliers[j - x + offset];
-	}
+    float sum = 0.0;
+    for (int j = minx; j < maxx; j++)
+    {
+        sum += inp[y * xsize + j] * multipliers[j - x + offset];
+    }
 
-	result[y * xsize + x] = sum * scale;
+    result[y * xsize + x] = sum * scale;
 }
 
-__kernel void ConvolutionY(__global float* multipliers, __global float* inp, __global float* result,
-	int step, int len, int offset, float border_ratio)
+__kernel void clConvolutionY(__global float* multipliers, __global float* inp, __global float* result,
+    int step, int len, int offset, float border_ratio)
 {
-	const int x = get_global_id(0);
-	const int y = get_global_id(1);
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
 
-	if (x % step != 0) return;
-	if (y % step != 0) return;
+    if (x % step != 0) return;
+    if (y % step != 0) return;
 
-	const int xsize = get_global_size(0);
-	const int ysize = get_global_size(1);
+    const int xsize = get_global_size(0);
+    const int ysize = get_global_size(1);
 
-	float weight_no_border = 0;
-	for (int j = 0; j <= 2 * offset; j++)
-	{
-		weight_no_border += multipliers[j];
-	}
+    float weight_no_border = 0;
+    for (int j = 0; j <= 2 * offset; j++)
+    {
+        weight_no_border += multipliers[j];
+    }
 
-	int miny = y < offset ? 0 : y - offset;
-	int maxy = min(ysize, y + len - offset);
+    int miny = y < offset ? 0 : y - offset;
+    int maxy = min(ysize, y + len - offset);
 
-	float weight = 0.0;
-	for (int j = miny; j < maxy; j++)
-	{
-		weight += multipliers[j - y + offset];
-	}
+    float weight = 0.0;
+    for (int j = miny; j < maxy; j++)
+    {
+        weight += multipliers[j - y + offset];
+    }
 
-	weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
-	float scale = 1.0 / weight;
+    weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
+    float scale = 1.0 / weight;
 
-	float sum = 0.0;
-	for (int j = miny; j < maxy; j++)
-	{
-		sum += inp[j * xsize + x] * multipliers[j - y + offset];
-	}
+    float sum = 0.0;
+    for (int j = miny; j < maxy; j++)
+    {
+        sum += inp[j * xsize + x] * multipliers[j - y + offset];
+    }
 
-	result[y * xsize + x] = sum * scale;
+    result[y * xsize + x] = sum * scale;
 }
 
-__kernel void Convolution(__global float* multipliers, __global float* inp, __global float* result,
-							int xsize, int xstep, int len, int offset, float border_ratio)
+__kernel void clConvolution(__global float* multipliers, __global float* inp, __global float* result,
+    int xsize, int xstep, int len, int offset, float border_ratio)
 {
-	const int ox = get_global_id(0);
-	const int y = get_global_id(1);
+    const int ox = get_global_id(0);
+    const int y = get_global_id(1);
 
-	const int oxsize = get_global_size(0);
-	const int ysize = get_global_size(1);
+    const int oxsize = get_global_size(0);
+    const int ysize = get_global_size(1);
 
-	const int x = ox * xstep;
+    const int x = ox * xstep;
 
-	float weight_no_border = 0;
-	for (int j = 0; j <= 2 * offset; j++)
-	{
-		weight_no_border += multipliers[j];
-	}
+    float weight_no_border = 0;
+    for (int j = 0; j <= 2 * offset; j++)
+    {
+        weight_no_border += multipliers[j];
+    }
 
-	int minx = x < offset ? 0 : x - offset;
-	int maxx = min(xsize, x + len - offset);
+    int minx = x < offset ? 0 : x - offset;
+    int maxx = min(xsize, x + len - offset);
 
-	float weight = 0.0;
-	for (int j = minx; j < maxx; j++)
-	{
-		weight += multipliers[j - x + offset];
-	}
+    float weight = 0.0;
+    for (int j = minx; j < maxx; j++)
+    {
+        weight += multipliers[j - x + offset];
+    }
 
-	weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
-	float scale = 1.0 / weight;
+    weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
+    float scale = 1.0 / weight;
 
-	float sum = 0.0;
-	for (int j = minx; j < maxx; j++)
-	{
-		sum += inp[y * xsize + j] * multipliers[j - x + offset];
-	}
+    float sum = 0.0;
+    for (int j = minx; j < maxx; j++)
+    {
+        sum += inp[y * xsize + j] * multipliers[j - x + offset];
+    }
 
-	result[ox * ysize + y] = sum * scale;
+    result[ox * ysize + y] = sum * scale;
 }
 
-__kernel void SquareSample(__global float* pA, __global float* pC, int xstep, int ystep)
+__kernel void clSquareSample(__global float* pA, __global float* pC, int xstep, int ystep)
 {
-	const int x = get_global_id(0);
-	const int y = get_global_id(1);
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
 
-	int x_sample = x - x % xstep;
-	int y_sample = y - y % ystep;
+    int x_sample = x - x % xstep;
+    int y_sample = y - y % ystep;
 
-	if (x_sample == x && y_sample == y) return;
+    if (x_sample == x && y_sample == y) return;
 
-	const int xsize = get_global_size(0);
-	const int ysize = get_global_size(1);
+    const int xsize = get_global_size(0);
+    const int ysize = get_global_size(1);
 
-	pC[y * xsize + x] = pA[y_sample * xsize + x_sample];
+    pC[y * xsize + x] = pA[y_sample * xsize + x_sample];
 }
 
-__kernel void DownSample(__global float* pA, __global float* pC, int xstep, int ystep)
+__kernel void clDownSample(__global float* pA, __global float* pC, int xstep, int ystep)
 {
-	const int x = get_global_id(0);
-	const int y = get_global_id(1);
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
 
-	const int xsize = get_global_size(0);
-	const int ysize = get_global_size(1);
+    const int xsize = get_global_size(0);
+    const int ysize = get_global_size(1);
 
-	const int oxsize = (xsize + xstep - 1) / xstep;
+    const int oxsize = (xsize + xstep - 1) / xstep;
 
-	const int sample_x = x / xstep;
-	const int sample_y = y / ystep;
+    const int sample_x = x / xstep;
+    const int sample_y = y / ystep;
 
-	pC[y * xsize + x] = pA[sample_y * oxsize + sample_x];
+    pC[y * xsize + x] = pA[sample_y * oxsize + sample_x];
 }
 
-__constant 	float g_mix[12] = {
-	0.348036746003,
-	0.577814843137,
-	0.0544556093735,
-	0.774145581713,
-	0.26922717275,
-	0.767247733938,
-	0.0366922708552,
-	0.920130265014,
-	0.0882062883536,
-	0.158581714673,
-	0.712857943858,
-	10.6524069248,
-};
-
-void OpsinAbsorbance(const double in[3], double out[3])
+__kernel void clScaleImage(double scale, __global float *result)
 {
-	out[0] = g_mix[0] * in[0] + g_mix[1] * in[1] + g_mix[2] * in[2] + g_mix[3];
-	out[1] = g_mix[4] * in[0] + g_mix[5] * in[1] + g_mix[6] * in[2] + g_mix[7];
-	out[2] = g_mix[8] * in[0] + g_mix[9] * in[1] + g_mix[10] * in[2] + g_mix[11];
+    const int i = get_global_id(0);
+    result[i] *= scale;
 }
 
-double EvaluatePolynomial(const double x, __constant const double *coefficients, int n)
+kernel void clRemoveBorder(__global float *in, int in_xsize, int s, int s2, __global float *out)
 {
-	double b1 = 0.0;
-	double b2 = 0.0;
-
-	for (int i = n - 1; i >= 0; i--)
-	{
-		if (i == 0) {
-			const double x_b1 = x * b1;
-			b1 = x_b1 - b2 + coefficients[0];
-			break;
-		}
-		const double x_b1 = x * b1;
-		const double t = (x_b1 + x_b1) - b2 + coefficients[i];
-		b2 = b1;
-		b1 = t;
-	}
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    const int xsize = get_global_size(0);
+    const int ysize = get_global_size(1);
 
-	return b1;
+    out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2];
 }
 
+kernel void clAddBorder(__global float *out, int s, int s2, __global float *in)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int xsize = get_global_size(0);
+    const int ysize = get_global_size(1);
 
-__constant double g_gamma_p[5 + 1] = {
-	881.979476556478289, 1496.058452015812463, 908.662212739659481,
-	373.566100223287378, 85.840860336314364, 6.683258861509244,
-};
-__constant double g_gamma_q[5 + 1] = {
-	12.262350348616792, 20.557285797683576, 12.161463238367844,
-	4.711532733641639, 0.899112889751053, 0.035662329617191,
-};
+    const double mul1 = 24.8235314874;
+    out[(y + s2) * xsize + x + s2] += (float)(mul1) * in[y * (xsize - s) + x];
 
-double Gamma(double v)
-{
-	const double min_value = 0.770000000000000;
-	const double max_value = 274.579999999999984;
-	const double x01 = (v - min_value) / (max_value - min_value);
-	const double xc = 2.0 * x01 - 1.0;
-
-	const double yp = EvaluatePolynomial(xc, g_gamma_p, 6);
-	const double yq = EvaluatePolynomial(xc, g_gamma_q, 6);
-	if (yq == 0.0) return 0.0;
-	return (float)(yp / yq);
 }
 
-void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz)
+__kernel void clCombineChannels(
+    __global float *mask_x, __global float *mask_y, __global float *mask_b,
+    __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b,
+    __global float *block_diff_dc,
+    __global float *block_diff_ac,
+    __global float *edge_detector_map,
+    int xsize, int ysize,
+    int res_xsize,
+    int step,
+    __global float *result)
 {
-	const double a0 = 1.01611726948;
-	const double a1 = 0.982482243696;
-	const double a2 = 1.43571362627;
-	const double a3 = 0.896039849412;
-	*valx = a0 * r - a1 * g;
-	*valy = a2 * r + a3 * g;
-	*valz = b;
+    const int res_x = get_global_id(0) * step;
+    const int res_y = get_global_id(1) * step;
+
+    double mask[3];
+    double dc_mask[3];
+    mask[0] = mask_x[(res_y + 3) * xsize + (res_x + 3)];
+    dc_mask[0] = mask_dc_x[(res_y + 3) * xsize + (res_x + 3)];
+
+    mask[1] = mask_y[(res_y + 3) * xsize + (res_x + 3)];
+    dc_mask[1] = mask_dc_y[(res_y + 3) * xsize + (res_x + 3)];
+
+    mask[2] = mask_b[(res_y + 3) * xsize + (res_x + 3)];
+    dc_mask[2] = mask_dc_b[(res_y + 3) * xsize + (res_x + 3)];
+
+    size_t res_ix = (res_y * res_xsize + res_x) / step;
+    result[res_ix] = (float)(
+        DotProduct(&block_diff_dc[3 * res_ix], dc_mask) +
+        DotProduct(&block_diff_ac[3 * res_ix], mask) +
+        DotProduct(&edge_detector_map[3 * res_ix], mask));
 }
 
-__kernel void OpsinDynamicsImage(
-	__global float *r, __global float *g, __global float *b,
-	__global float *r_blurred, __global float *g_blurred, __global float *b_blurred,
-	int size)
+__kernel void clDiffPrecompute(
+    __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b,
+    __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b,
+    __global float *mask_x, __global float *mask_y, __global float *mask_b)
 {
-	const int i = get_global_id(0);
-	double pre[3] = { r_blurred[i], g_blurred[i],  b_blurred[i] };
-	double pre_mixed[3];
-	OpsinAbsorbance(pre, pre_mixed);
-
-	double sensitivity[3];
-	sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];
-	sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1];
-	sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2];
-
-	double cur_rgb[3] = { r[i], g[i],  b[i] };
-	double cur_mixed[3];
-    OpsinAbsorbance(cur_rgb, cur_mixed);
-    cur_mixed[0] *= sensitivity[0];
-    cur_mixed[1] *= sensitivity[1];
-    cur_mixed[2] *= sensitivity[2];
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int xsize = get_global_size(0);
+    const int ysize = get_global_size(1);
+
+    double valsh0[3] = { 0.0 };
+    double valsv0[3] = { 0.0 };
+    double valsh1[3] = { 0.0 };
+    double valsv1[3] = { 0.0 };
+    int ix2;
+
+    int ix = x + xsize * y;
+    if (x + 1 < xsize) {
+        ix2 = ix + 1;
+    }
+    else {
+        ix2 = ix - 1;
+    }
+    {
+        double x0 = (xyb0_x[ix] - xyb0_x[ix2]);
+        double y0 = (xyb0_y[ix] - xyb0_y[ix2]);
+        double z0 = (xyb0_b[ix] - xyb0_b[ix2]);
+        XybToVals(x0, y0, z0, &valsh0[0], &valsh0[1], &valsh0[2]);
+        double x1 = (xyb1_x[ix] - xyb1_x[ix2]);
+        double y1 = (xyb1_y[ix] - xyb1_y[ix2]);
+        double z1 = (xyb1_b[ix] - xyb1_b[ix2]);
+        XybToVals(x1, y1, z1, &valsh1[0], &valsh1[1], &valsh1[2]);
+    }
+    if (y + 1 < ysize) {
+        ix2 = ix + xsize;
+    }
+    else {
+        ix2 = ix - xsize;
+    }
+    {
+        double x0 = (xyb0_x[ix] - xyb0_x[ix2]);
+        double y0 = (xyb0_y[ix] - xyb0_y[ix2]);
+        double z0 = (xyb0_b[ix] - xyb0_b[ix2]);
+        XybToVals(x0, y0, z0, &valsv0[0], &valsv0[1], &valsv0[2]);
+        double x1 = (xyb1_x[ix] - xyb1_x[ix2]);
+        double y1 = (xyb1_y[ix] - xyb1_y[ix2]);
+        double z1 = (xyb1_b[ix] - xyb1_b[ix2]);
+        XybToVals(x1, y1, z1, &valsv1[0], &valsv1[1], &valsv1[2]);
+    }
 
-    double x, y, z;
-	RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z);
-    r[i] = x;
-    g[i] = y;
-    b[i] = z;
+    double sup0 = fabs(valsh0[0]) + fabs(valsv0[0]);
+    double sup1 = fabs(valsh1[0]) + fabs(valsv1[0]);
+    double m = min(sup0, sup1);
+    mask_x[ix] = (float)(m);
+
+    sup0 = fabs(valsh0[1]) + fabs(valsv0[1]);
+    sup1 = fabs(valsh1[1]) + fabs(valsv1[1]);
+    m = min(sup0, sup1);
+    mask_y[ix] = (float)(m);
+
+    sup0 = fabs(valsh0[2]) + fabs(valsv0[2]);
+    sup1 = fabs(valsh1[2]) + fabs(valsv1[2]);
+    m = min(sup0, sup1);
+    mask_b[ix] = (float)(m);
 }
 
+__kernel void clEdgeDetectorMap(__global float *result,
+    __global float *r, __global float *g, __global float* b,
+    __global float *r2, __global float* g2, __global float *b2,
+    int xsize, int ysize, int step)
+{
+    const int res_x = get_global_id(0);
+    const int res_y = get_global_id(1);
 
-double InterpolateClampNegative(__global const double *array,
-	int size, double sx) {
-	if (sx < 0) {
-		sx = 0;
-	}
-	double ix = fabs(sx);
-	int baseix = (int)(ix);
-	double res;
-	if (baseix >= size - 1) {
-		res = array[size - 1];
-	}
-	else {
-		double mix = ix - baseix;
-		int nextix = baseix + 1;
-		res = array[baseix] + mix * (array[nextix] - array[baseix]);
-	}
-	return res;
+    const int res_xsize = get_global_size(0);
+    const int res_ysize = get_global_size(1);
+
+    int pos_x = res_x * step;
+    int pos_y = res_y * step;
+
+    if (pos_x >= xsize - (8 - step)) return;
+    if (pos_y >= ysize - (8 - step)) return;
+
+    pos_x = min(pos_x, xsize - 8);
+    pos_y = min(pos_y, ysize - 8);
+
+    double diff_xyb[3] = { 0.0 };
+    Butteraugli8x8CornerEdgeDetectorDiff(pos_x, pos_y, xsize, ysize,
+        r, g, b,
+        r2, g2, b2,
+        &diff_xyb[0]);
+
+    int idx = (res_y * res_xsize + res_x) * 3;
+    result[idx] = diff_xyb[0];
+    result[idx + 1] = diff_xyb[1];
+    result[idx + 2] = diff_xyb[2];
 }
 
-__kernel void DoMask(
-	__global float *mask_x, __global float *mask_y, __global float *mask_b,
-	__global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b,
-	__global double *lut_x, __global double *lut_y, __global double *lut_b,
-	__global double *lut_dc_x, __global double *lut_dc_y, __global double *lut_dc_b)
+__kernel void clEdgeDetectorLowFreq(__global float *result,
+    __global float *r, __global float *g, __global float* b,
+    __global float *r2, __global float* g2, __global float *b2,
+    int xsize, int ysize, int step)
 {
-	const double w00 = 232.206464018;
-	const double w11 = 22.9455222245;
-	const double w22 = 503.962310606;
-
-	const int x = get_global_id(0);
-	const int y = get_global_id(1);
-
-	const int xsize = get_global_size(0);
-	const int ysize = get_global_size(1);
-
-	const size_t idx = y * xsize + x;
-	const double s0 = mask_x[idx];
-	const double s1 = mask_y[idx];
-	const double s2 = mask_b[idx];
-	const double p0 = w00 * s0;
-	const double p1 = w11 * s1;
-	const double p2 = w22 * s2;
-
-	mask_x[idx] = (float)(InterpolateClampNegative(lut_x, 512, p0));
-	mask_y[idx] = (float)(InterpolateClampNegative(lut_y, 512, p1));
-	mask_b[idx] = (float)(InterpolateClampNegative(lut_b, 512, p2));
-	mask_dc_x[idx] = (float)(InterpolateClampNegative(lut_dc_x, 512, p0));
-	mask_dc_y[idx] = (float)(InterpolateClampNegative(lut_dc_y, 512, p1));
-	mask_dc_b[idx] = (float)(InterpolateClampNegative(lut_dc_b, 512, p2));
+    const int res_x = get_global_id(0);
+    const int res_y = get_global_id(1);
+
+    if (res_x < 8 / step) return;
+
+    const int res_xsize = get_global_size(0);
+    const int res_ysize = get_global_size(1);
+
+    int pos_x = (res_x - (8 / step)) * step;
+    int pos_y = res_y * step;
+
+    if (pos_x + 8 >= xsize) return;
+    if (pos_y + 8 >= ysize) return;
+
+    int ix = pos_y * xsize + pos_x;
+
+    double diff[4][3];
+    __global float* blurred0[3] = { r, g, b };
+    __global float* blurred1[3] = { r2, g2, b2 };
+
+    for (int i = 0; i < 3; ++i) {
+        int ix2 = ix + 8;
+        diff[0][i] =
+            ((blurred1[i][ix] - blurred0[i][ix]) +
+            (blurred0[i][ix2] - blurred1[i][ix2]));
+        ix2 = ix + 8 * xsize;
+        diff[1][i] =
+            ((blurred1[i][ix] - blurred0[i][ix]) +
+            (blurred0[i][ix2] - blurred1[i][ix2]));
+        ix2 = ix + 6 * xsize + 6;
+        diff[2][i] =
+            ((blurred1[i][ix] - blurred0[i][ix]) +
+            (blurred0[i][ix2] - blurred1[i][ix2]));
+        ix2 = ix + 6 * xsize - 6;
+        diff[3][i] = pos_x < 8 ? 0 :
+            ((blurred1[i][ix] - blurred0[i][ix]) +
+            (blurred0[i][ix2] - blurred1[i][ix2]));
+    }
+    double max_diff_xyb[3] = { 0 };
+    for (int k = 0; k < 4; ++k) {
+        double diff_xyb[3] = { 0 };
+        XybDiffLowFreqSquaredAccumulate(diff[k][0], diff[k][1], diff[k][2],
+            0, 0, 0, 1.0,
+            diff_xyb);
+        for (int i = 0; i < 3; ++i) {
+            max_diff_xyb[i] = max(max_diff_xyb[i], diff_xyb[i]);
+        }
+    }
+
+    int res_ix = res_y * res_xsize + res_x;
+
+    const double kMul = 10;
 
+    result[res_ix * 3] += max_diff_xyb[0] * kMul;
+    result[res_ix * 3 + 1] += max_diff_xyb[1] * kMul;
+    result[res_ix * 3 + 2] += max_diff_xyb[2] * kMul;
 }
 
-__kernel void ScaleImage(double scale, __global float *result)
+__kernel void clDoMask(
+    __global float *mask_x, __global float *mask_y, __global float *mask_b,
+    __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b,
+    __global double *lut_x, __global double *lut_y, __global double *lut_b,
+    __global double *lut_dc_x, __global double *lut_dc_y, __global double *lut_dc_b)
 {
-	const int i = get_global_id(0);
-	result[i] *= scale;
+    const double w00 = 232.206464018;
+    const double w11 = 22.9455222245;
+    const double w22 = 503.962310606;
+
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    const int xsize = get_global_size(0);
+    const int ysize = get_global_size(1);
+
+    const size_t idx = y * xsize + x;
+    const double s0 = mask_x[idx];
+    const double s1 = mask_y[idx];
+    const double s2 = mask_b[idx];
+    const double p0 = w00 * s0;
+    const double p1 = w11 * s1;
+    const double p2 = w22 * s2;
+
+    mask_x[idx] = (float)(InterpolateClampNegative(lut_x, 512, p0));
+    mask_y[idx] = (float)(InterpolateClampNegative(lut_y, 512, p1));
+    mask_b[idx] = (float)(InterpolateClampNegative(lut_b, 512, p2));
+    mask_dc_x[idx] = (float)(InterpolateClampNegative(lut_dc_x, 512, p0));
+    mask_dc_y[idx] = (float)(InterpolateClampNegative(lut_dc_y, 512, p1));
+    mask_dc_b[idx] = (float)(InterpolateClampNegative(lut_dc_b, 512, p2));
+
 }
 
-double DotProduct(__global float u[3], double v[3]) {
-  return u[0] * v[0] + u[1] * v[1] + u[2] * v[2];
+__kernel void clBlockDiffMap(__global float* r, __global float* g, __global float* b,
+    __global float* r2, __global float* g2, __global float* b2,
+    __global float* block_diff_dc, __global float* block_diff_ac,
+    int xsize, int ysize, int step)
+{
+    const int res_x = get_global_id(0);
+    const int res_y = get_global_id(1);
+
+    const int res_xsize = get_global_size(0);
+    const int res_ysize = get_global_size(1);
+
+    int pos_x = res_x * step;
+    int pos_y = res_y * step;
+
+    if ((pos_x + kBlockEdge - step - 1) >= xsize) return;
+    if ((pos_y + kBlockEdge - step - 1) >= ysize) return;
+
+    size_t res_ix = res_y * res_xsize + res_x;
+    size_t offset = min(pos_y, ysize - 8) * xsize + min(pos_x, xsize - 8);
+
+    double block0[3 * kBlockEdge * kBlockEdge];
+    double block1[3 * kBlockEdge * kBlockEdge];
+
+    double *block0_r = &block0[0];
+    double *block0_g = &block0[kBlockEdge * kBlockEdge];
+    double *block0_b = &block0[2 * kBlockEdge * kBlockEdge];
+
+    double *block1_r = &block1[0];
+    double *block1_g = &block1[kBlockEdge * kBlockEdge];
+    double *block1_b = &block1[2 * kBlockEdge * kBlockEdge];
+
+    for (int y = 0; y < kBlockEdge; y++)
+    {
+        for (int x = 0; x < kBlockEdge; x++)
+        {
+            block0_r[kBlockEdge * y + x] = r[offset + y * xsize + x];
+            block0_g[kBlockEdge * y + x] = g[offset + y * xsize + x];
+            block0_b[kBlockEdge * y + x] = b[offset + y * xsize + x];
+            block1_r[kBlockEdge * y + x] = r2[offset + y * xsize + x];
+            block1_g[kBlockEdge * y + x] = g2[offset + y * xsize + x];
+            block1_b[kBlockEdge * y + x] = b2[offset + y * xsize + x];
+        }
+    }
+
+    double diff_xyb_dc[3] = { 0.0 };
+    double diff_xyb_ac[3] = { 0.0 };
+    double diff_xyb_edge_dc[3] = { 0.0 };
+
+    ButteraugliBlockDiff(block0, block1, diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc);
+
+    for (int i = 0; i < 3; i++)
+    {
+        block_diff_dc[3 * res_ix + i] = diff_xyb_dc[i];
+        block_diff_ac[3 * res_ix + i] = diff_xyb_ac[i];
+    }
 }
 
-__kernel void CombineChannels(
-	__global float *mask_x, __global float *mask_y, __global float *mask_b,
-	__global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b,
-	__global float *block_diff_dc,
-	__global float *block_diff_ac,
-	__global float *edge_detector_map,
-	int xsize, int ysize,
-	int res_xsize,
-	int step,
-	__global float *result)
+__kernel void clMaskHighIntensityChange(
+    __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b,
+    __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b,
+    __global float *c0_x, __global float *c0_y, __global float *c0_b,
+    __global float *c1_x, __global float *c1_y, __global float *c1_b
+)
 {
-	const int res_x = get_global_id(0) * step;
-	const int res_y = get_global_id(1) * step;
-
-	double mask[3];
-	double dc_mask[3];
-	mask[0] = mask_x[(res_y + 3) * xsize + (res_x + 3)];
-	dc_mask[0] = mask_dc_x[(res_y + 3) * xsize + (res_x + 3)];
-
-	mask[1] = mask_y[(res_y + 3) * xsize + (res_x + 3)];
-	dc_mask[1] = mask_dc_y[(res_y + 3) * xsize + (res_x + 3)];
-
-	mask[2] = mask_b[(res_y + 3) * xsize + (res_x + 3)];
-	dc_mask[2] = mask_dc_b[(res_y + 3) * xsize + (res_x + 3)];
-
-	size_t res_ix = (res_y * res_xsize + res_x) / step;
-	result[res_ix] = (float)(
-		DotProduct(&block_diff_dc[3 * res_ix], dc_mask) +
-		DotProduct(&block_diff_ac[3 * res_ix], mask) +
-		DotProduct(&edge_detector_map[3 * res_ix], mask));
-	//result[res_ix] = 1;
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int xsize = get_global_size(0);
+    const int ysize = get_global_size(1);
+
+    size_t ix = y * xsize + x;
+    const double ave[3] = {
+        (c0_x[ix] + c1_x[ix]) * 0.5,
+        (c0_y[ix] + c1_y[ix]) * 0.5,
+        (c0_b[ix] + c1_b[ix]) * 0.5,
+    };
+    double sqr_max_diff = -1;
+    {
+        int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) };
+        int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize };
+        for (int dir = 0; dir < 4; ++dir) {
+            if (border[dir]) {
+                continue;
+            }
+            const int ix2 = ix + offset[dir];
+            double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1];
+            diff *= diff;
+            if (sqr_max_diff < diff) {
+                sqr_max_diff = diff;
+            }
+        }
+    }
+    const double kReductionX = 275.19165240059317;
+    const double kReductionY = 18599.41286306991;
+    const double kReductionZ = 410.8995306951065;
+    const double kChromaBalance = 106.95800948271017;
+    double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance);
+
+    const double mix[3] = {
+        chroma_scale * kReductionX / (sqr_max_diff + kReductionX),
+        kReductionY / (sqr_max_diff + kReductionY),
+        chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ),
+    };
+    // Interpolate lineraly between the average color and the actual
+    // color -- to reduce the importance of this pixel.
+    xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]);
+    xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]);
+
+    xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]);
+    xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]);
+
+    xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]);
+    xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]);
 }
 
-inline double Interpolate(__constant double *array, int size, double sx) {
-	double ix = fabs(sx);
+__kernel void clUpsampleSquareRoot(__global float *diffmap, int xsize, int ysize, int step, __global float *diffmap_out)
+{
+    const int res_x = get_global_id(0);
+    const int res_y = get_global_id(1);
 
-	int baseix = (int)(ix);
-	double res;
-	if (baseix >= size - 1) {
-		res = array[size - 1];
-	}
-	else {
-		double mix = ix - baseix;
-		int nextix = baseix + 1;
-		res = array[baseix] + mix * (array[nextix] - array[baseix]);
-	}
-	if (sx < 0) res = -res;
-	return res;
-}
+    const int res_xsize = get_global_size(0);
+    const int res_ysize = get_global_size(1);
 
-#define XybLowFreqToVals_inc 5.2511644570349185
-__constant double XybLowFreqToVals_lut[21] = {
-	0,
-	1 * XybLowFreqToVals_inc,
-	2 * XybLowFreqToVals_inc,
-	3 * XybLowFreqToVals_inc,
-	4 * XybLowFreqToVals_inc,
-	5 * XybLowFreqToVals_inc,
-	6 * XybLowFreqToVals_inc,
-	7 * XybLowFreqToVals_inc,
-	8 * XybLowFreqToVals_inc,
-	9 * XybLowFreqToVals_inc,
-	10 * XybLowFreqToVals_inc,
-	11 * XybLowFreqToVals_inc,
-	12 * XybLowFreqToVals_inc,
-	13 * XybLowFreqToVals_inc,
-	14 * XybLowFreqToVals_inc,
-	15 * XybLowFreqToVals_inc,
-	16 * XybLowFreqToVals_inc,
-	17 * XybLowFreqToVals_inc,
-	18 * XybLowFreqToVals_inc,
-	19 * XybLowFreqToVals_inc,
-	20 * XybLowFreqToVals_inc,
-};
+    const int pos_x = res_x * step;
+    const int pos_y = res_y * step;
 
-void XybLowFreqToVals(double x, double y, double z,
-	double *valx, double *valy, double *valz) {
-	const double xmul = 6.64482198135;
-	const double ymul = 0.837846224276;
-	const double zmul = 7.34905756986;
-	const double y_to_z_mul = 0.0812519812628;
-
-	z += y_to_z_mul * y;
-	*valz = z * zmul;
-	*valx = x * xmul;
-	*valy = Interpolate(&XybLowFreqToVals_lut[0], 21, y * ymul);
+    if (pos_y + 8 - step >= ysize) return;
+    if (pos_x + 8 - step >= xsize) return;
+
+    int s2 = (8 - step) / 2;
+
+    // Upsample and take square root.
+    float orig_val = diffmap[res_y * res_xsize + res_x];
+
+    const float kInitialSlope = 100;
+    // TODO(b/29974893): Until that is fixed do not call sqrt on very small
+    // numbers.
+    double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope))
+        ? kInitialSlope * orig_val
+        : sqrt(orig_val);
+
+    for (size_t off_y = 0; off_y < step; ++off_y) {
+        for (size_t off_x = 0; off_x < step; ++off_x) {
+            diffmap_out[(pos_y + off_y + s2) * xsize + pos_x + off_x + s2] = val;
+        }
+    }
 }
 
-void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0,
-	double r1, double g1, double b1,
-	double factor, double res[3]) {
-	double valx0, valy0, valz0;
-	double valx1, valy1, valz1;
-	XybLowFreqToVals(r0, g0, b0, &valx0, &valy0, &valz0);
-	if (r1 == 0.0 && g1 == 0.0 && b1 == 0.0) {
-		//PROFILER_ZONE("XybDiff r1=g1=b1=0");
-		res[0] += factor * valx0 * valx0;
-		res[1] += factor * valy0 * valy0;
-		res[2] += factor * valz0 * valz0;
-		return;
-	}
-	XybLowFreqToVals(r1, g1, b1, &valx1, &valy1, &valz1);
-	// Approximate the distance of the colors by their respective distances
-	// to gray.
-	double valx = valx0 - valx1;
-	double valy = valy0 - valy1;
-	double valz = valz0 - valz1;
-	res[0] += factor * valx * valx;
-	res[1] += factor * valy * valy;
-	res[2] += factor * valz * valz;
+__kernel void clAverageAddImage(__global float *img, __global float *tmp0, __global float *tmp1)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int xsize = get_global_size(0);
+    const int ysize = get_global_size(1);
+
+    const int row0 = y * xsize;
+    if (x == 0) // excute once per y
+    {
+        img[row0 + 1] += tmp0[row0];
+        img[row0 + 0] += tmp0[row0 + 1];
+        img[row0 + 2] += tmp0[row0 + 1];
+
+        img[row0 + xsize - 3] += tmp0[row0 + xsize - 2];
+        img[row0 + xsize - 1] += tmp0[row0 + xsize - 2];
+        img[row0 + xsize - 2] += tmp0[row0 + xsize - 1];
+
+        if (y > 0) {
+            const int rowd1 = row0 - xsize;
+            img[rowd1 + 1] += tmp1[row0];
+            img[rowd1 + 0] += tmp0[row0];
+
+            img[rowd1 + xsize - 1] += tmp0[row0 + xsize - 1];
+            img[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1];
+        }
+        if (y + 1 < ysize) {
+            const int rowu1 = row0 + xsize;
+            img[rowu1 + 1] += tmp1[row0];
+            img[rowu1 + 0] += tmp0[row0];
+
+            img[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1];
+            img[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1];
+        }
+    }
+
+    if (x >= 2 && x < xsize - 2)
+    {
+        img[row0 + x - 1] += tmp0[row0 + x];
+        img[row0 + x + 1] += tmp0[row0 + x];
+    }
+
+    if (x >= 1 && x < xsize - 1) {
+        if (y > 0) {
+            const int rowd1 = row0 - xsize;
+            img[rowd1 + x + 1] += tmp1[row0 + x];
+            img[rowd1 + x + 0] += tmp0[row0 + x];
+            img[rowd1 + x - 1] += tmp1[row0 + x];
+        }
+        if (y + 1 < ysize) {
+            const int rowu1 = row0 + xsize;
+            img[rowu1 + x + 1] += tmp1[row0 + x];
+            img[rowu1 + x + 0] += tmp0[row0 + x];
+            img[rowu1 + x - 1] += tmp1[row0 + x];
+        }
+    }
 }
 
+
+
+
+
+
 void Butteraugli8x8CornerEdgeDetectorDiff(
     int pos_x,
     int pos_y,
@@ -517,105 +758,176 @@ void Butteraugli8x8CornerEdgeDetectorDiff(
     }
 }
 
-__kernel void edgeDetectorMap(__global float *result,
-						      __global float *r, __global float *g, __global float* b,
-						      __global float *r2, __global float* g2, __global float *b2,
-						     int xsize, int ysize, int step)
-{
-	const int res_x = get_global_id(0);
-	const int res_y = get_global_id(1);
 
-	const int res_xsize = get_global_size(0);
-	const int res_ysize = get_global_size(1);
 
-	int pos_x = res_x * step;
-	int pos_y = res_y * step;
+double DotProduct(__global float u[3], double v[3]) {
+    return u[0] * v[0] + u[1] * v[1] + u[2] * v[2];
+}
 
-	if (pos_x >= xsize - (8 - step)) return;
-	if (pos_y >= ysize - (8 - step)) return;
+double Interpolate(__constant double *array, int size, double sx) {
+    double ix = fabs(sx);
 
-	pos_x = min(pos_x, xsize - 8);
-	pos_y = min(pos_y, ysize - 8);
+    int baseix = (int)(ix);
+    double res;
+    if (baseix >= size - 1) {
+        res = array[size - 1];
+    }
+    else {
+        double mix = ix - baseix;
+        int nextix = baseix + 1;
+        res = array[baseix] + mix * (array[nextix] - array[baseix]);
+    }
+    if (sx < 0) res = -res;
+    return res;
+}
 
-    double diff_xyb[3] = { 0.0 };
-    Butteraugli8x8CornerEdgeDetectorDiff(pos_x, pos_y, xsize, ysize,
-        r, g, b,
-        r2, g2, b2,
-        &diff_xyb[0]);
+#define XybToVals_off_x 11.38708334481672
+#define XybToVals_inc_x 14.550189611520716
+__constant double XybToVals_lut_x[21] = {
+    0,
+    XybToVals_off_x,
+    XybToVals_off_x + 1 * XybToVals_inc_x,
+    XybToVals_off_x + 2 * XybToVals_inc_x,
+    XybToVals_off_x + 3 * XybToVals_inc_x,
+    XybToVals_off_x + 4 * XybToVals_inc_x,
+    XybToVals_off_x + 5 * XybToVals_inc_x,
+    XybToVals_off_x + 6 * XybToVals_inc_x,
+    XybToVals_off_x + 7 * XybToVals_inc_x,
+    XybToVals_off_x + 8 * XybToVals_inc_x,
+    XybToVals_off_x + 9 * XybToVals_inc_x,
+    XybToVals_off_x + 10 * XybToVals_inc_x,
+    XybToVals_off_x + 11 * XybToVals_inc_x,
+    XybToVals_off_x + 12 * XybToVals_inc_x,
+    XybToVals_off_x + 13 * XybToVals_inc_x,
+    XybToVals_off_x + 14 * XybToVals_inc_x,
+    XybToVals_off_x + 15 * XybToVals_inc_x,
+    XybToVals_off_x + 16 * XybToVals_inc_x,
+    XybToVals_off_x + 17 * XybToVals_inc_x,
+    XybToVals_off_x + 18 * XybToVals_inc_x,
+    XybToVals_off_x + 19 * XybToVals_inc_x,
+};
 
-	int idx = (res_y * res_xsize + res_x) * 3;
-	result[idx]     = diff_xyb[0];
-	result[idx + 1] = diff_xyb[1];
-	result[idx + 2] = diff_xyb[2];
-}
+#define XybToVals_off_y 1.4103373714040413
+#define XybToVals_inc_y 0.7084088867024
+__constant double XybToVals_lut_y[21] = {
+    0,
+    XybToVals_off_y,
+    XybToVals_off_y + 1 * XybToVals_inc_y,
+    XybToVals_off_y + 2 * XybToVals_inc_y,
+    XybToVals_off_y + 3 * XybToVals_inc_y,
+    XybToVals_off_y + 4 * XybToVals_inc_y,
+    XybToVals_off_y + 5 * XybToVals_inc_y,
+    XybToVals_off_y + 6 * XybToVals_inc_y,
+    XybToVals_off_y + 7 * XybToVals_inc_y,
+    XybToVals_off_y + 8 * XybToVals_inc_y,
+    XybToVals_off_y + 9 * XybToVals_inc_y,
+    XybToVals_off_y + 10 * XybToVals_inc_y,
+    XybToVals_off_y + 11 * XybToVals_inc_y,
+    XybToVals_off_y + 12 * XybToVals_inc_y,
+    XybToVals_off_y + 13 * XybToVals_inc_y,
+    XybToVals_off_y + 14 * XybToVals_inc_y,
+    XybToVals_off_y + 15 * XybToVals_inc_y,
+    XybToVals_off_y + 16 * XybToVals_inc_y,
+    XybToVals_off_y + 17 * XybToVals_inc_y,
+    XybToVals_off_y + 18 * XybToVals_inc_y,
+    XybToVals_off_y + 19 * XybToVals_inc_y,
+};
 
-__kernel void edgeDetectorLowFreq(__global float *result,
-	__global float *r, __global float *g, __global float* b,
-	__global float *r2, __global float* g2, __global float *b2,
-	int xsize, int ysize, int step)
+void XybToVals(
+    double x, double y, double z,
+    double *valx, double *valy, double *valz)
 {
-	const int res_x = get_global_id(0);
-	const int res_y = get_global_id(1);
-
-	if (res_x < 8 / step) return;
-
-	const int res_xsize = get_global_size(0);
-	const int res_ysize = get_global_size(1);
-
-	int pos_x = (res_x - (8 / step)) * step;
-	int pos_y = res_y * step;
-
-	if (pos_x + 8 >= xsize) return;
-	if (pos_y + 8 >= ysize) return;
-
-	int ix = pos_y * xsize + pos_x;
-
-	double diff[4][3];
-	__global float* blurred0[3] = { r, g, b };
-	__global float* blurred1[3] = { r2, g2, b2 };
-
-	for (int i = 0; i < 3; ++i) {
-		int ix2 = ix + 8;
-		diff[0][i] =
-			((blurred1[i][ix] - blurred0[i][ix]) +
-			(blurred0[i][ix2] - blurred1[i][ix2]));
-		ix2 = ix + 8 * xsize;
-		diff[1][i] =
-			((blurred1[i][ix] - blurred0[i][ix]) +
-			(blurred0[i][ix2] - blurred1[i][ix2]));
-		ix2 = ix + 6 * xsize + 6;
-		diff[2][i] =
-			((blurred1[i][ix] - blurred0[i][ix]) +
-			(blurred0[i][ix2] - blurred1[i][ix2]));
-		ix2 = ix + 6 * xsize - 6;
-		diff[3][i] = pos_x < 8 ? 0 :
-			((blurred1[i][ix] - blurred0[i][ix]) +
-			(blurred0[i][ix2] - blurred1[i][ix2]));
-	}
-	double max_diff_xyb[3] = { 0 };
-	for (int k = 0; k < 4; ++k) {
-		double diff_xyb[3] = { 0 };
-		XybDiffLowFreqSquaredAccumulate(diff[k][0], diff[k][1], diff[k][2],
-			0, 0, 0, 1.0,
-			diff_xyb);
-		for (int i = 0; i < 3; ++i) {
-			max_diff_xyb[i] = max(max_diff_xyb[i], diff_xyb[i]);
-		}
-	}
+    const double xmul = 0.758304045695;
+    const double ymul = 2.28148649801;
+    const double zmul = 1.87816926918;
+
+    *valx = Interpolate(&XybToVals_lut_x[0], 21, x * xmul);
+    *valy = Interpolate(&XybToVals_lut_y[0], 21, y * ymul);
+    *valz = zmul * z;
+}
 
-	int res_ix = res_y * res_xsize + res_x;
 
-	const double kMul = 10;
+#define XybLowFreqToVals_inc 5.2511644570349185
+__constant double XybLowFreqToVals_lut[21] = {
+    0,
+    1 * XybLowFreqToVals_inc,
+    2 * XybLowFreqToVals_inc,
+    3 * XybLowFreqToVals_inc,
+    4 * XybLowFreqToVals_inc,
+    5 * XybLowFreqToVals_inc,
+    6 * XybLowFreqToVals_inc,
+    7 * XybLowFreqToVals_inc,
+    8 * XybLowFreqToVals_inc,
+    9 * XybLowFreqToVals_inc,
+    10 * XybLowFreqToVals_inc,
+    11 * XybLowFreqToVals_inc,
+    12 * XybLowFreqToVals_inc,
+    13 * XybLowFreqToVals_inc,
+    14 * XybLowFreqToVals_inc,
+    15 * XybLowFreqToVals_inc,
+    16 * XybLowFreqToVals_inc,
+    17 * XybLowFreqToVals_inc,
+    18 * XybLowFreqToVals_inc,
+    19 * XybLowFreqToVals_inc,
+    20 * XybLowFreqToVals_inc,
+};
 
-	result[res_ix * 3] += max_diff_xyb[0] * kMul;
-	result[res_ix * 3 + 1] += max_diff_xyb[1] * kMul;
-	result[res_ix * 3 + 2] += max_diff_xyb[2] * kMul;
+void XybLowFreqToVals(double x, double y, double z,
+    double *valx, double *valy, double *valz) {
+    const double xmul = 6.64482198135;
+    const double ymul = 0.837846224276;
+    const double zmul = 7.34905756986;
+    const double y_to_z_mul = 0.0812519812628;
+
+    z += y_to_z_mul * y;
+    *valz = z * zmul;
+    *valx = x * xmul;
+    *valy = Interpolate(&XybLowFreqToVals_lut[0], 21, y * ymul);
 }
 
-#define kBlockEdge 8
-#define kBlockSize (kBlockEdge * kBlockEdge)
-#define kBlockEdgeHalf  (kBlockEdge / 2)
-#define kBlockHalf (kBlockEdge * kBlockEdgeHalf)
+
+double InterpolateClampNegative(__global const double *array,
+	int size, double sx) {
+	if (sx < 0) {
+		sx = 0;
+	}
+	double ix = fabs(sx);
+	int baseix = (int)(ix);
+	double res;
+	if (baseix >= size - 1) {
+		res = array[size - 1];
+	}
+	else {
+		double mix = ix - baseix;
+		int nextix = baseix + 1;
+		res = array[baseix] + mix * (array[nextix] - array[baseix]);
+	}
+	return res;
+}
+
+void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0,
+	double r1, double g1, double b1,
+	double factor, double res[3]) {
+	double valx0, valy0, valz0;
+	double valx1, valy1, valz1;
+	XybLowFreqToVals(r0, g0, b0, &valx0, &valy0, &valz0);
+	if (r1 == 0.0 && g1 == 0.0 && b1 == 0.0) {
+		//PROFILER_ZONE("XybDiff r1=g1=b1=0");
+		res[0] += factor * valx0 * valx0;
+		res[1] += factor * valy0 * valy0;
+		res[2] += factor * valz0 * valz0;
+		return;
+	}
+	XybLowFreqToVals(r1, g1, b1, &valx1, &valy1, &valz1);
+	// Approximate the distance of the colors by their respective distances
+	// to gray.
+	double valx = valx0 - valx1;
+	double valy = valy0 - valy1;
+	double valz = valz0 - valz1;
+	res[0] += factor * valx * valx;
+	res[1] += factor * valy * valy;
+	res[2] += factor * valz * valz;
+}
 
 typedef struct __Complex
 {
@@ -1026,361 +1338,80 @@ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize],
 	}
 }
 
-__kernel void blockDiffMap(__global float* r, __global float* g, __global float* b,
-	__global float* r2, __global float* g2, __global float* b2,
-	__global float* block_diff_dc, __global float* block_diff_ac,
-	int xsize, int ysize, int step)
-{
-	const int res_x = get_global_id(0);
-	const int res_y = get_global_id(1);
-
-	const int res_xsize = get_global_size(0);
-	const int res_ysize = get_global_size(1);
-
-	int pos_x = res_x * step;
-	int pos_y = res_y * step;
-
-	if ((pos_x + kBlockEdge - step - 1) >= xsize) return;
-	if ((pos_y + kBlockEdge - step - 1) >= ysize) return;
-
-	size_t res_ix = res_y * res_xsize + res_x;
-	size_t offset = min(pos_y, ysize - 8) * xsize + min(pos_x, xsize - 8);
-
-	double block0[3 * kBlockEdge * kBlockEdge];
-	double block1[3 * kBlockEdge * kBlockEdge];
-
-	double *block0_r = &block0[0];
-	double *block0_g = &block0[kBlockEdge * kBlockEdge];
-	double *block0_b = &block0[2 * kBlockEdge * kBlockEdge];
-
-	double *block1_r = &block1[0];
-	double *block1_g = &block1[kBlockEdge * kBlockEdge];
-	double *block1_b = &block1[2 * kBlockEdge * kBlockEdge];
-
-	for (int y = 0; y < kBlockEdge; y++)
-	{
-		for (int x = 0; x < kBlockEdge; x++)
-		{
-			block0_r[kBlockEdge * y + x] = r[offset + y * xsize + x];
-			block0_g[kBlockEdge * y + x] = g[offset + y * xsize + x];
-			block0_b[kBlockEdge * y + x] = b[offset + y * xsize + x];
-			block1_r[kBlockEdge * y + x] = r2[offset + y * xsize + x];
-			block1_g[kBlockEdge * y + x] = g2[offset + y * xsize + x];
-			block1_b[kBlockEdge * y + x] = b2[offset + y * xsize + x];
-		}
-	}
-
-	double diff_xyb_dc[3] = { 0.0 };
-	double diff_xyb_ac[3] = { 0.0 };
-	double diff_xyb_edge_dc[3] = { 0.0 };
 
-	ButteraugliBlockDiff(block0, block1, diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc);
-
-	for (int i = 0; i < 3; i++)
-	{
-		block_diff_dc[3 * res_ix + i] = diff_xyb_dc[i];
-		block_diff_ac[3 * res_ix + i] = diff_xyb_ac[i];
-	}
-}
-__kernel void MaskHighIntensityChange(
-	__global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b,
-	__global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b,
-	__global float *c0_x, __global float *c0_y, __global float *c0_b,
-	__global float *c1_x, __global float *c1_y, __global float *c1_b
-	)
-{
-	const int x = get_global_id(0);
-	const int y = get_global_id(1);
-	const int xsize = get_global_size(0);
-	const int ysize = get_global_size(1);
-
-	size_t ix = y * xsize + x;
-	const double ave[3] = {
-	(c0_x[ix] + c1_x[ix]) * 0.5,
-	(c0_y[ix] + c1_y[ix]) * 0.5,
-	(c0_b[ix] + c1_b[ix]) * 0.5,
-	};
-	double sqr_max_diff = -1;
-	{
-		int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) };
-		int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize };
-		for (int dir = 0; dir < 4; ++dir) {
-			if (border[dir]) {
-				continue;
-			}
-			const int ix2 = ix + offset[dir];
-			double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1];
-			diff *= diff;
-			if (sqr_max_diff < diff) {
-			sqr_max_diff = diff;
-			}
-		}
-	}
-	const double kReductionX = 275.19165240059317;
-	const double kReductionY = 18599.41286306991;
-	const double kReductionZ = 410.8995306951065;
-	const double kChromaBalance = 106.95800948271017;
-	double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance);
-
-	const double mix[3] = {
-		chroma_scale * kReductionX / (sqr_max_diff + kReductionX),
-		kReductionY / (sqr_max_diff + kReductionY),
-		chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ),
-	};
-	// Interpolate lineraly between the average color and the actual
-	// color -- to reduce the importance of this pixel.
-	xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]);
-	xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]);
-
-	xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]);
-	xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]);
-
-	xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]);
-	xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]);
-}
-
-
-#define XybToVals_off_x 11.38708334481672
-#define XybToVals_inc_x 14.550189611520716
-__constant double XybToVals_lut_x[21] = {
-	0,
-	XybToVals_off_x,
-	XybToVals_off_x + 1 * XybToVals_inc_x,
-	XybToVals_off_x + 2 * XybToVals_inc_x,
-	XybToVals_off_x + 3 * XybToVals_inc_x,
-	XybToVals_off_x + 4 * XybToVals_inc_x,
-	XybToVals_off_x + 5 * XybToVals_inc_x,
-	XybToVals_off_x + 6 * XybToVals_inc_x,
-	XybToVals_off_x + 7 * XybToVals_inc_x,
-	XybToVals_off_x + 8 * XybToVals_inc_x,
-	XybToVals_off_x + 9 * XybToVals_inc_x,
-	XybToVals_off_x + 10 * XybToVals_inc_x,
-	XybToVals_off_x + 11 * XybToVals_inc_x,
-	XybToVals_off_x + 12 * XybToVals_inc_x,
-	XybToVals_off_x + 13 * XybToVals_inc_x,
-	XybToVals_off_x + 14 * XybToVals_inc_x,
-	XybToVals_off_x + 15 * XybToVals_inc_x,
-	XybToVals_off_x + 16 * XybToVals_inc_x,
-	XybToVals_off_x + 17 * XybToVals_inc_x,
-	XybToVals_off_x + 18 * XybToVals_inc_x,
-	XybToVals_off_x + 19 * XybToVals_inc_x,
-};
-
-#define XybToVals_off_y 1.4103373714040413
-#define XybToVals_inc_y 0.7084088867024
-__constant double XybToVals_lut_y[21] = {
-	0,
-	XybToVals_off_y,
-	XybToVals_off_y + 1 * XybToVals_inc_y,
-	XybToVals_off_y + 2 * XybToVals_inc_y,
-	XybToVals_off_y + 3 * XybToVals_inc_y,
-	XybToVals_off_y + 4 * XybToVals_inc_y,
-	XybToVals_off_y + 5 * XybToVals_inc_y,
-	XybToVals_off_y + 6 * XybToVals_inc_y,
-	XybToVals_off_y + 7 * XybToVals_inc_y,
-	XybToVals_off_y + 8 * XybToVals_inc_y,
-	XybToVals_off_y + 9 * XybToVals_inc_y,
-	XybToVals_off_y + 10 * XybToVals_inc_y,
-	XybToVals_off_y + 11 * XybToVals_inc_y,
-	XybToVals_off_y + 12 * XybToVals_inc_y,
-	XybToVals_off_y + 13 * XybToVals_inc_y,
-	XybToVals_off_y + 14 * XybToVals_inc_y,
-	XybToVals_off_y + 15 * XybToVals_inc_y,
-	XybToVals_off_y + 16 * XybToVals_inc_y,
-	XybToVals_off_y + 17 * XybToVals_inc_y,
-	XybToVals_off_y + 18 * XybToVals_inc_y,
-	XybToVals_off_y + 19 * XybToVals_inc_y,
-};
-
-void XybToVals(
-	double x, double y, double z,
-	double *valx, double *valy, double *valz)
-{
-	const double xmul = 0.758304045695;
-    const double ymul = 2.28148649801;
-	const double zmul = 1.87816926918;
-
-	*valx = Interpolate(&XybToVals_lut_x[0], 21, x * xmul);
-	*valy = Interpolate(&XybToVals_lut_y[0], 21, y * ymul);
-	*valz = zmul * z;
-}
-
-__kernel void DiffPrecompute(
-	__global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b,
-	__global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b,
-	__global float *mask_x, __global float *mask_y, __global float *mask_b )
-{
-	const int x = get_global_id(0);
-	const int y = get_global_id(1);
-	const int xsize = get_global_size(0);
-	const int ysize = get_global_size(1);
-
-	double valsh0[3] = { 0.0 };
-	double valsv0[3] = { 0.0 };
-	double valsh1[3] = { 0.0 };
-	double valsv1[3] = { 0.0 };
-	int ix2;
-
-	int ix = x + xsize * y;
-	if (x + 1 < xsize) {
-		ix2 = ix + 1;
-	}
-	else {
-		ix2 = ix - 1;
-	}
-	{
-		double x0 = (xyb0_x[ix] - xyb0_x[ix2]);
-		double y0 = (xyb0_y[ix] - xyb0_y[ix2]);
-		double z0 = (xyb0_b[ix] - xyb0_b[ix2]);
-		XybToVals(x0, y0, z0, &valsh0[0], &valsh0[1], &valsh0[2]);
-		double x1 = (xyb1_x[ix] - xyb1_x[ix2]);
-		double y1 = (xyb1_y[ix] - xyb1_y[ix2]);
-		double z1 = (xyb1_b[ix] - xyb1_b[ix2]);
-		XybToVals(x1, y1, z1, &valsh1[0], &valsh1[1], &valsh1[2]);
-	}
-	if (y + 1 < ysize) {
-		ix2 = ix + xsize;
-	}
-	else {
-		ix2 = ix - xsize;
-	}
-	{
-		double x0 = (xyb0_x[ix] - xyb0_x[ix2]);
-		double y0 = (xyb0_y[ix] - xyb0_y[ix2]);
-		double z0 = (xyb0_b[ix] - xyb0_b[ix2]);
-		XybToVals(x0, y0, z0, &valsv0[0], &valsv0[1], &valsv0[2]);
-		double x1 = (xyb1_x[ix] - xyb1_x[ix2]);
-		double y1 = (xyb1_y[ix] - xyb1_y[ix2]);
-		double z1 = (xyb1_b[ix] - xyb1_b[ix2]);
-		XybToVals(x1, y1, z1, &valsv1[0], &valsv1[1], &valsv1[2]);
-	}
-
-	double sup0 = fabs(valsh0[0]) + fabs(valsv0[0]);
-	double sup1 = fabs(valsh1[0]) + fabs(valsv1[0]);
-	double m = min(sup0, sup1);
-	mask_x[ix] = (float)(m);
-
-	sup0 = fabs(valsh0[1]) + fabs(valsv0[1]);
-	sup1 = fabs(valsh1[1]) + fabs(valsv1[1]);
-	m = min(sup0, sup1);
-	mask_y[ix] = (float)(m);
-
-	sup0 = fabs(valsh0[2]) + fabs(valsv0[2]);
-	sup1 = fabs(valsh1[2]) + fabs(valsv1[2]);
-	m = min(sup0, sup1);
-	mask_b[ix] = (float)(m);
-}
-
-__kernel void UpsampleSquareRoot(__global float *diffmap, int xsize, int ysize, int step, __global float *diffmap_out)
+void OpsinAbsorbance(const double in[3], double out[3])
 {
-	const int res_x = get_global_id(0);
-	const int res_y = get_global_id(1);
-
-    const int res_xsize = get_global_size(0);
-    const int res_ysize = get_global_size(1);
-
-    const int pos_x = res_x * step;
-    const int pos_y = res_y * step;
-
-	if (pos_y + 8 - step >= ysize) return;
-	if (pos_x + 8 - step >= xsize) return;
-
-	int s2 = (8 - step) / 2;
-
-	// Upsample and take square root.
-	float orig_val = diffmap[res_y * res_xsize + res_x];
-
-	const float kInitialSlope = 100;
-	// TODO(b/29974893): Until that is fixed do not call sqrt on very small
-	// numbers.
-	double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope))
-		? kInitialSlope * orig_val
-		: sqrt(orig_val);
-
-	for (size_t off_y = 0; off_y < step; ++off_y) {
-		for (size_t off_x = 0; off_x < step; ++off_x) {
-			diffmap_out[(pos_y + off_y + s2) * xsize + pos_x + off_x + s2] = val;
-		}
-	}
+    __constant static float g_mix[12] = {
+        0.348036746003,
+        0.577814843137,
+        0.0544556093735,
+        0.774145581713,
+        0.26922717275,
+        0.767247733938,
+        0.0366922708552,
+        0.920130265014,
+        0.0882062883536,
+        0.158581714673,
+        0.712857943858,
+        10.6524069248,
+    };
+
+    out[0] = g_mix[0] * in[0] + g_mix[1] * in[1] + g_mix[2] * in[2] + g_mix[3];
+    out[1] = g_mix[4] * in[0] + g_mix[5] * in[1] + g_mix[6] * in[2] + g_mix[7];
+    out[2] = g_mix[8] * in[0] + g_mix[9] * in[1] + g_mix[10] * in[2] + g_mix[11];
 }
 
-kernel void removeBorder(__global float *in, int in_xsize, int s, int s2, __global float *out)
+double EvaluatePolynomial(const double x, __constant const double *coefficients, int n)
 {
-	const int x = get_global_id(0);
-	const int y = get_global_id(1);
+    double b1 = 0.0;
+    double b2 = 0.0;
 
-	const int xsize = get_global_size(0);
-	const int ysize = get_global_size(1);
+    for (int i = n - 1; i >= 0; i--)
+    {
+        if (i == 0) {
+            const double x_b1 = x * b1;
+            b1 = x_b1 - b2 + coefficients[0];
+            break;
+        }
+        const double x_b1 = x * b1;
+        const double t = (x_b1 + x_b1) - b2 + coefficients[i];
+        b2 = b1;
+        b1 = t;
+    }
 
-	out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2];
+    return b1;
 }
 
-kernel void addBorder(__global float *out, int s, int s2, __global float *in)
+double Gamma(double v)
 {
-	const int x = get_global_id(0);
-	const int y = get_global_id(1);
-	const int xsize = get_global_size(0);
-	const int ysize = get_global_size(1);
-
-	const double mul1 = 24.8235314874;
-	out[(y + s2) * xsize + x + s2]	+= (float)(mul1) * in[y * (xsize - s) + x];
-
+    static  __constant double g_gamma_p[5 + 1] = {
+        881.979476556478289, 1496.058452015812463, 908.662212739659481,
+        373.566100223287378, 85.840860336314364, 6.683258861509244,
+    };
+
+    static __constant double g_gamma_q[5 + 1] = {
+        12.262350348616792, 20.557285797683576, 12.161463238367844,
+        4.711532733641639, 0.899112889751053, 0.035662329617191,
+    };
+
+    const double min_value = 0.770000000000000;
+    const double max_value = 274.579999999999984;
+    const double x01 = (v - min_value) / (max_value - min_value);
+    const double xc = 2.0 * x01 - 1.0;
+
+    const double yp = EvaluatePolynomial(xc, g_gamma_p, 6);
+    const double yq = EvaluatePolynomial(xc, g_gamma_q, 6);
+    if (yq == 0.0) return 0.0;
+    return (float)(yp / yq);
 }
 
-__kernel void AverageAddImage(__global float *img, __global float *tmp0, __global float *tmp1)
+void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz)
 {
-	const int x = get_global_id(0);
-	const int y = get_global_id(1);
-	const int xsize = get_global_size(0);
-	const int ysize = get_global_size(1);
-
-	const int row0 = y * xsize;
-	if (x == 0) // excute once per y
-	{
-		img[row0 + 1] += tmp0[row0];
-		img[row0 + 0] += tmp0[row0 + 1];
-		img[row0 + 2] += tmp0[row0 + 1];
-
-		img[row0 + xsize - 3] += tmp0[row0 + xsize - 2];
-		img[row0 + xsize - 1] += tmp0[row0 + xsize - 2];
-		img[row0 + xsize - 2] += tmp0[row0 + xsize - 1];
-
-		if (y > 0) {
-			const int rowd1 = row0 - xsize;
-			img[rowd1 + 1] += tmp1[row0];
-			img[rowd1 + 0] += tmp0[row0];
-
-			img[rowd1 + xsize - 1] += tmp0[row0 + xsize - 1];
-			img[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1];
-		}
-		if (y + 1 < ysize) {
-			const int rowu1 = row0 + xsize;
-			img[rowu1 + 1] += tmp1[row0];
-			img[rowu1 + 0] += tmp0[row0];
-
-			img[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1];
-			img[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1];
-		}
-	}
-
-	if (x >= 2 && x < xsize - 2)
-	{
-		img[row0 + x - 1] += tmp0[row0 + x];
-		img[row0 + x + 1] += tmp0[row0 + x];
-	}
-
-	if (x >= 1 && x < xsize - 1) {
-		if (y > 0) {
-			const int rowd1 = row0 - xsize;
-			img[rowd1 + x + 1] += tmp1[row0 + x];
-			img[rowd1 + x + 0] += tmp0[row0 + x];
-			img[rowd1 + x - 1] += tmp1[row0 + x];
-		}
-		if (y + 1 < ysize) {
-			const int rowu1 = row0 + xsize;
-			img[rowu1 + x + 1] += tmp1[row0 + x];
-			img[rowu1 + x + 0] += tmp0[row0 + x];
-			img[rowu1 + x - 1] += tmp1[row0 + x];
-		}
-	}
-}
+    const double a0 = 1.01611726948;
+    const double a1 = 0.982482243696;
+    const double a2 = 1.43571362627;
+    const double a3 = 0.896039849412;
+    *valx = a0 * r - a1 * g;
+    *valy = a2 * r + a3 * g;
+    *valz = b;
+}
\ No newline at end of file
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 5fe8da6d..20f246f3 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -44,25 +44,25 @@ ocl_args_d_t& getOcl(void)
             LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]);
         }
 	}
-	ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "MinSquareVal", &err);
-	ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "Convolution", &err);
-	ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "ConvolutionX", &err);
-	ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "ConvolutionY", &err);
-	ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "SquareSample", &err);
-	ocl.kernel[KERNEL_DOWNSAMPLE] = clCreateKernel(ocl.program, "DownSample", &err);
-	ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "OpsinDynamicsImage", &err);
-	ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "DoMask", &err);
-	ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "ScaleImage", &err);
-	ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "CombineChannels", &err);
-	ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "MaskHighIntensityChange", &err);
-	ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "DiffPrecompute", &err);
-	ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "UpsampleSquareRoot", &err);
-	ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "addBorder", &err);
-	ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "removeBorder", &err);
-	ocl.kernel[KERNEL_AVERAGEADDIMAGE] = clCreateKernel(ocl.program, "AverageAddImage", &err);
-	ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "edgeDetectorMap", &err);
-	ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "blockDiffMap", &err);
-	ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "edgeDetectorLowFreq", &err);
+	ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "clMinSquareVal", &err);
+	ocl.kernel[KERNEL_CONVOLUTION] =  clCreateKernel(ocl.program, "clConvolution", &err);
+	ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionX", &err);
+	ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionY", &err);
+	ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "clSquareSample", &err);
+	ocl.kernel[KERNEL_DOWNSAMPLE] =   clCreateKernel(ocl.program, "clDownSample", &err);
+	ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "clOpsinDynamicsImage", &err);
+	ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "clDoMask", &err);
+	ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "clScaleImage", &err);
+	ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "clCombineChannels", &err);
+	ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "clMaskHighIntensityChange", &err);
+	ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "clDiffPrecompute", &err);
+	ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "clUpsampleSquareRoot", &err);
+	ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "clAddBorder", &err);
+	ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "clRemoveBorder", &err);
+	ocl.kernel[KERNEL_AVERAGEADDIMAGE] = clCreateKernel(ocl.program, "clAverageAddImage", &err);
+	ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMap", &err);
+	ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMap", &err);
+	ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreq", &err);
 
 	return ocl;
 }
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index a287c8cc..a25ddc08 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -5,6 +5,14 @@
 extern bool g_useOpenCL;
 extern bool g_checkOpenCL;
 
+void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b);
+
+void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
+    float* r2, float* g2, float* b2,
+    size_t xsize, size_t ysize,
+    size_t step,
+    float* result);
+
 void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/,
 	ocl_channels xyb1/*in,out*/,
 	size_t xsize, size_t ysize);
@@ -27,14 +35,6 @@ void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double bor
 
 void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t xsize, size_t ysize);
 
-void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b);
-
-void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
-	float* r2, float* g2, float* b2,
-	size_t xsize, size_t ysize,
-	size_t step,
-	float* result);
-
 void clCombineChannelsEx(
 	ocl_channels mask,
 	ocl_channels mask_dc,

From 7c97e95f2096197250782739a6dbb2019876522f Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 13 May 2017 09:30:50 +0800
Subject: [PATCH 065/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3n=E5=8D=A1=E4=B8=8A?=
 =?UTF-8?q?=E7=9A=84=E7=BC=96=E8=AF=91=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl | 49 +++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index d44f5c07..9d132ae8 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1338,24 +1338,23 @@ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize],
 	}
 }
 
+__constant static float g_mix[12] = {
+    0.348036746003,
+    0.577814843137,
+    0.0544556093735,
+    0.774145581713,
+    0.26922717275,
+    0.767247733938,
+    0.0366922708552,
+    0.920130265014,
+    0.0882062883536,
+    0.158581714673,
+    0.712857943858,
+    10.6524069248,
+};
 
 void OpsinAbsorbance(const double in[3], double out[3])
 {
-    __constant static float g_mix[12] = {
-        0.348036746003,
-        0.577814843137,
-        0.0544556093735,
-        0.774145581713,
-        0.26922717275,
-        0.767247733938,
-        0.0366922708552,
-        0.920130265014,
-        0.0882062883536,
-        0.158581714673,
-        0.712857943858,
-        10.6524069248,
-    };
-
     out[0] = g_mix[0] * in[0] + g_mix[1] * in[1] + g_mix[2] * in[2] + g_mix[3];
     out[1] = g_mix[4] * in[0] + g_mix[5] * in[1] + g_mix[6] * in[2] + g_mix[7];
     out[2] = g_mix[8] * in[0] + g_mix[9] * in[1] + g_mix[10] * in[2] + g_mix[11];
@@ -1382,18 +1381,18 @@ double EvaluatePolynomial(const double x, __constant const double *coefficients,
     return b1;
 }
 
-double Gamma(double v)
-{
-    static  __constant double g_gamma_p[5 + 1] = {
-        881.979476556478289, 1496.058452015812463, 908.662212739659481,
-        373.566100223287378, 85.840860336314364, 6.683258861509244,
-    };
+static  __constant double g_gamma_p[5 + 1] = {
+    881.979476556478289, 1496.058452015812463, 908.662212739659481,
+    373.566100223287378, 85.840860336314364, 6.683258861509244,
+};
 
-    static __constant double g_gamma_q[5 + 1] = {
-        12.262350348616792, 20.557285797683576, 12.161463238367844,
-        4.711532733641639, 0.899112889751053, 0.035662329617191,
-    };
+static __constant double g_gamma_q[5 + 1] = {
+    12.262350348616792, 20.557285797683576, 12.161463238367844,
+    4.711532733641639, 0.899112889751053, 0.035662329617191,
+};
 
+double Gamma(double v)
+{
     const double min_value = 0.770000000000000;
     const double max_value = 274.579999999999984;
     const double x01 = (v - min_value) / (max_value - min_value);

From 5eb14f3b5d4065b3acd9c588bb47bc200e3d79cf Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 13 May 2017 10:50:03 +0800
Subject: [PATCH 066/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

---
 clguetzli/clbutter_comparator.cpp  | 17 ++++++
 clguetzli/clguetzli.cpp            | 89 ++++++++++++++++++++++++++++--
 clguetzli/clguetzli.h              |  6 ++
 clguetzli/clguetzli_comparator.cpp | 44 +++------------
 guetzli/processor.cc               | 54 +++++++++++++++++-
 5 files changed, 166 insertions(+), 44 deletions(-)

diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index 650e4373..1da9a2cd 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -153,6 +153,23 @@ namespace butteraugli
         std::vector<std::vector<float> > *mask,
         std::vector<std::vector<float> > *mask_dc)
     {
+        if (g_useOpenCL)
+        {
+            mask->resize(3);
+            mask_dc->resize(3);
+            for (int i = 0; i < 3; i++)
+            {
+                (*mask)[i].resize(xsize * ysize);
+                (*mask_dc)[i].resize(xsize * ysize);
+            }
+            clMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+                xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+                xsize, ysize,
+                (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(),
+                (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data());
+            return;
+        }
+
         _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc);
 
         if (g_checkOpenCL)
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 20f246f3..32d0d77b 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -765,7 +765,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	double scaler = 0.0738288224836;
 	double mul = 20.8029176447;
 	static double lut_x[512];
-	MakeMask(extmul, extoff, mul, offset, scaler, lut_x);
+    static bool lutx_init = false;
+    if (!lutx_init)
+    {
+        lutx_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_x);
+    }
 
 	extmul = 0.373995618954;
 	extoff = 1.5307267433;
@@ -773,7 +778,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	scaler = 1.1731667845;
 	mul = 16.2447033988;
 	static double lut_y[512];
-	MakeMask(extmul, extoff, mul, offset, scaler, lut_y);
+    static bool luty_init = false;
+    if (!luty_init)
+    {
+        luty_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_y);
+    }
 
 	extmul = 0.61582234137;
 	extoff = -4.25376118646;
@@ -781,7 +791,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	scaler = 0.47434643535;
 	mul = 31.1444967089;
 	static double lut_b[512];
-	MakeMask(extmul, extoff, mul, offset, scaler, lut_b);
+    static bool lutb_init = false;
+    if (!lutb_init)
+    {
+        lutb_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_b);
+    }
 
 	extmul = 1.79116943438;
 	extoff = -3.86797479189;
@@ -789,7 +804,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	scaler = 0.486575865525;
 	mul = 20.4563479139;
 	static double lut_dcx[512];
-	MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx);
+    static bool lutdcx_init = false;
+    if (!lutdcx_init)
+    {
+        lutdcx_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx);
+    }
 
 	extmul = 0.212223514236;
 	extoff = -3.65647120524;
@@ -797,7 +817,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	scaler = 0.170392660501;
 	mul = 21.6566724788;
 	static double lut_dcy[512];
-	MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy);
+    static bool lutdcy_init = false;
+    if (!lutdcy_init)
+    {
+        lutdcy_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy);
+    }
 
 	extmul = 0.349376011816;
 	extoff = -0.894711072781;
@@ -805,7 +830,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	scaler = 0.380086095024;
 	mul = 18.0373825149;
 	static double lut_dcb[512];
-	MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb);
+    static bool lutdcb_init = false;
+    if (!lutdcb_init)
+    {
+        lutdcb_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb);
+    }
 
 	size_t channel_size = 512 * 3 * sizeof(double);
 	ocl_channels xyb = ocl.allocMemChannels(channel_size);
@@ -876,6 +906,53 @@ void clMaskEx(ocl_channels rgb, ocl_channels rgb2,
     }
 }
 
+void clMask(const float* r, const float* g, const float* b,
+    const float* r2, const float* g2, const float* b2,
+    size_t xsize, size_t ysize,
+    float* mask_r, float* mask_g, float* mask_b,
+    float* maskdc_r, float* maskdc_g, float* maskdc_b)
+{
+    cl_int err = CL_SUCCESS;
+    ocl_args_d_t &ocl = getOcl();
+
+    cl_int channel_size = xsize * ysize * sizeof(float);
+
+    ocl_channels rgb = ocl.allocMemChannels(channel_size);
+    ocl_channels rgb2 = ocl.allocMemChannels(channel_size);
+    ocl_channels mask = ocl.allocMemChannels(channel_size);
+    ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
+
+    clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
+    clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
+    clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
+    clEnqueueWriteBuffer(ocl.commandQueue, rgb2.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
+    clEnqueueWriteBuffer(ocl.commandQueue, rgb2.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
+    clEnqueueWriteBuffer(ocl.commandQueue, rgb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
+    err = clFinish(ocl.commandQueue);
+
+    clMaskEx(rgb, rgb2, xsize, ysize, mask, mask_dc);
+
+    cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+    cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+    cl_float *r0_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+    cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+    cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+    cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+    err = clFinish(ocl.commandQueue);
+
+    memcpy(mask_r, r0_r, channel_size);
+    memcpy(mask_g, r0_g, channel_size);
+    memcpy(mask_b, r0_b, channel_size);
+    memcpy(maskdc_r, r1_r, channel_size);
+    memcpy(maskdc_g, r1_g, channel_size);
+    memcpy(maskdc_b, r1_b, channel_size);
+
+    ocl.releaseMemChannels(rgb);
+    ocl.releaseMemChannels(rgb2);
+    ocl.releaseMemChannels(mask);
+    ocl.releaseMemChannels(mask_dc);
+}
+
 void clCombineChannelsEx(
 	ocl_channels mask,
 	ocl_channels mask_dc,
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index a25ddc08..aa595ab5 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -13,6 +13,12 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
     size_t step,
     float* result);
 
+void clMask(const float* r, const float* g, const float* b,
+    const float* r2, const float* g2, const float* b2,
+    size_t xsize, size_t ysize,
+    float* mask_r, float* mask_g, float* mask_b,
+    float* maskdc_r, float* maskdc_g, float* maskdc_b);
+
 void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/,
 	ocl_channels xyb1/*in,out*/,
 	size_t xsize, size_t ysize);
diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp
index ce3a9b64..03653754 100644
--- a/clguetzli/clguetzli_comparator.cpp
+++ b/clguetzli/clguetzli_comparator.cpp
@@ -238,36 +238,13 @@ void IDCTToImage(const uint8_t idct[8 * 8], uint16_t *pixels_)
 // out = [YUVYUV....YUVYUV]
 void ImageToYUV(uint16_t *pixels_, uint8_t *out)
 {
-	const int ymin = 0;
-	const int xmin = 0;
-	const int ysize = 8;
-	const int xsize = 8;
-	const int width_ = 8;
-	const int height_ = 8;
 	const int stride = 3;
 
-	const int yend1 = ymin + ysize;
-	const int yend0 = std::min(yend1, height_);
-	int y = ymin;
-	for (; y < yend0; ++y) {
-		const int xend1 = xmin + xsize;
-		const int xend0 = std::min(xend1, width_);
-		int x = xmin;
-		int px = y * width_ + xmin;
-		for (; x < xend0; ++x, ++px, out += stride) {
+	for (int y = 0; y < 8; ++y) {
+		for (int x = 0; x < 8; ++x) {
+            int px = y * 8 + x;
 			*out = static_cast<uint8_t>((pixels_[px] + 8 - (x & 1)) >> 4);
-		}
-		const int offset = -stride;
-		for (; x < xend1; ++x) {
-			*out = out[offset];
-			out += stride;
-		}
-	}
-	for (; y < yend1; ++y) {
-		const int offset = -stride * xsize;
-		for (int x = 0; x < xsize; ++x) {
-			*out = out[offset];
-			out += stride;
+            out += stride;
 		}
 	}
 }
@@ -305,10 +282,10 @@ void BlockToImage(coeff_t *block, float* r, float* g, float* b)
 	uint8_t yuv[8 * 8 * 3];
 
 	ImageToYUV(&pixels[0], &yuv[0]);
-	ImageToYUV(&pixels[8*8], &yuv[8*8]);
-	ImageToYUV(&pixels[8*8*2], &yuv[8*8*2]);
+	ImageToYUV(&pixels[8*8], &yuv[1]);
+	ImageToYUV(&pixels[8*8*2], &yuv[2]);
 
-	YUVToRGB(yuv);
+    YUVToRGB(yuv);
 
 	const double* lut = Srgb8ToLinearTable();
 	for (int i = 0; i < 8 * 8; i++)
@@ -341,17 +318,14 @@ namespace guetzli
 
 	double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, coeff_t* candidate_block)
 	{
-        return 0;
-		int block_x = block_x_ * factor_x_ + off_x;
+	int block_x = block_x_ * factor_x_ + off_x;
 		int block_y = block_y_ * factor_y_ + off_y;
 		int xmin = 8 * block_x;
 		int ymin = 8 * block_y;
 		int block_ix = off_y * factor_x_ + off_x;
 		const std::vector<std::vector<float> >& rgb0_c = per_block_pregamma_[block_ix];
 
-		std::vector<std::vector<float> > rgb1_c2(3, std::vector<float>(kDCTBlockSize));
-		img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2);
-
+        //
 		std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
 		BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data());
 
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 62613d04..c8684b35 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -55,6 +55,8 @@ class Processor {
                        ProcessStats* stats);
 
  private:
+     void CompareBlockZeroingOrderBatch(const JPEGData& jpg, OutputImage* img);
+
   void SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
                               const uint8_t comp_mask, const double target_mul,
                               bool stop_early);
@@ -434,7 +436,7 @@ void Processor::ComputeBlockZeroingOrder(
       }
 
       float max_err = 0;
-
+/*
       for (int iy = 0; iy < factor_y; ++iy) {
         for (int ix = 0; ix < factor_x; ++ix) {
           int block_xx = block_x * factor_x + ix;
@@ -445,8 +447,8 @@ void Processor::ComputeBlockZeroingOrder(
           }
         }
       }
-
-	  /*max_err = */((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(*img, 0, 0, candidate_block);
+*/
+	  max_err = ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(*img, 0, 0, candidate_block);
 
       if (max_err < best_err) { // TOBEREMOVE:�ҳ���С����ֵ��i
         best_err = max_err;
@@ -558,6 +560,52 @@ size_t EstimateDCSize(const JPEGData& jpg) {
 
 }  // namespace
 
+void Processor::CompareBlockZeroingOrderBatch(const JPEGData& jpg, OutputImage* img)
+{
+    // we only support factor_x == factor_y == 1
+    const int width = img->width();
+    const int height = img->height();
+    const int factor_x = 1;
+    const int factor_y = 1;
+
+    const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x);
+    const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y);
+    const int num_blocks = block_width * block_height;
+
+    comparator_->StartBlockComparisons(); // TOBEREMOVE:��ʼ��һЩ����
+    std::vector<coeff_t> orig_block_batch(num_blocks * kBlockSize);   // [block_r block_g block_b]
+    std::vector<coeff_t> block_batch(num_blocks * kBlockSize);        // [block_r block_g block_b]
+
+    for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
+        for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
+            coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize];
+            coeff_t *block = &block_batch[block_ix * kBlockSize];
+
+            for (int c = 0; c < 3; ++c)
+            {
+                img->component(c).GetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]); // TOBEREMOVE:ȡ���Ա�ͼ��blockϵ��
+
+                const JPEGComponent& comp = jpg.components[c];
+                int jpg_block_ix = block_y * comp.width_in_blocks + block_x;
+                memcpy(&orig_block[c * kDCTBlockSize], &comp.coeffs[jpg_block_ix * kDCTBlockSize], kDCTBlockSize * sizeof(orig_block[0])); // TOBEREMOVE:ȡ��ԭʼͼ��blockϵ��
+            }
+
+/*
+            std::vector<CoeffData> block_order;
+            block_order.clear();
+            ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, factor_y, comp_mask, img, &block_order); // TOBEREMOVE:����ԭʼblock�ͶԱ�ͼ��block����zeroing order����block_order
+            candidate_coeff_offsets[block_ix] = candidate_coeffs.size();
+            for (size_t i = 0; i < block_order.size(); ++i) { // TOBEREMOVE:�ѽ����ֵ����ѡϵ��
+                candidate_coeffs.push_back(block_order[i].idx);
+                candidate_coeff_errors.push_back(block_order[i].block_err);
+            }
+*/
+        }
+    }
+
+    comparator_->FinishBlockComparisons(); // TOBEREMOVE:�������
+}
+
 void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
                                        const uint8_t comp_mask,
                                        const double target_mul,

From f12e272387e3480a0f9ff9cf2827de35cf6da16f Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 13 May 2017 11:43:32 +0800
Subject: [PATCH 067/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=20SelectFrequencyMas?=
 =?UTF-8?q?kingBatch=20=E5=8C=96=E5=A4=84=E7=90=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli_comparator.cpp |   8 +-
 clguetzli/clguetzli_comparator.h   |   2 +-
 guetzli/processor.cc               | 561 ++++++++++++++++++-----------
 3 files changed, 350 insertions(+), 221 deletions(-)

diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp
index 03653754..5970e6d8 100644
--- a/clguetzli/clguetzli_comparator.cpp
+++ b/clguetzli/clguetzli_comparator.cpp
@@ -316,13 +316,13 @@ namespace guetzli
 		ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y);
 	}
 
-	double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, coeff_t* candidate_block)
+	double ButteraugliComparatorEx::CompareBlockEx(coeff_t* candidate_block)
 	{
-	int block_x = block_x_ * factor_x_ + off_x;
-		int block_y = block_y_ * factor_y_ + off_y;
+	int block_x = block_x_ * factor_x_;
+		int block_y = block_y_ * factor_y_;
 		int xmin = 8 * block_x;
 		int ymin = 8 * block_y;
-		int block_ix = off_y * factor_x_ + off_x;
+		int block_ix = 0;
 		const std::vector<std::vector<float> >& rgb0_c = per_block_pregamma_[block_ix];
 
         //
diff --git a/clguetzli/clguetzli_comparator.h b/clguetzli/clguetzli_comparator.h
index 778d0532..96642ce6 100644
--- a/clguetzli/clguetzli_comparator.h
+++ b/clguetzli/clguetzli_comparator.h
@@ -16,7 +16,7 @@ namespace guetzli {
 		void StartBlockComparisons();
 		void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y);
 
-		double CompareBlockEx(const OutputImage& img, int off_x, int off_y, coeff_t* candidate_block);
+		double CompareBlockEx(coeff_t* candidate_block);
 
 	protected:
 		std::vector<float> imgOpsinDynamicsBlockList;
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index c8684b35..d1607c14 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -55,16 +55,30 @@ class Processor {
                        ProcessStats* stats);
 
  private:
-     void CompareBlockZeroingOrderBatch(const JPEGData& jpg, OutputImage* img);
+
+     void SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const double target_mul, bool stop_early);
 
   void SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
                               const uint8_t comp_mask, const double target_mul,
                               bool stop_early);
+
+  void SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img,
+      const uint8_t comp_mask,
+      const double target_mul,
+      bool stop_early,
+      std::vector<int> &candidate_coeff_offsets,
+      std::vector<uint8_t>& candidate_coeffs,
+      std::vector<float> &candidate_coeff_errors);
+
   void ComputeBlockZeroingOrder(
       const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
       const int block_x, const int block_y, const int factor_x,
       const int factor_y, const uint8_t comp_mask, OutputImage* img,
       std::vector<CoeffData>* output_order);
+
+  void ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
+      const int block_x, const int block_y, std::vector<CoeffData>* output_order);
+
   bool SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
                          int best_q[3][kDCTBlockSize],
                          OutputImage* img);
@@ -365,41 +379,6 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
 }
 
 
-void func(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
-	const uint8_t comp_mask, guetzli::Params &params_, std::vector<std::pair<int, float> > &input_order)
-{
-	static const uint8_t oldCsf[kDCTBlockSize] = {
-		10, 10, 20, 40, 60, 70, 80, 90,
-		10, 20, 30, 60, 70, 80, 90, 90,
-		20, 30, 60, 70, 80, 90, 90, 90,
-		40, 60, 70, 80, 90, 90, 90, 90,
-		60, 70, 80, 90, 90, 90, 90, 90,
-		70, 80, 90, 90, 90, 90, 90, 90,
-		80, 90, 90, 90, 90, 90, 90, 90,
-		90, 90, 90, 90, 90, 90, 90, 90,
-	};
-	static const double kWeight[3] = { 1.0, 0.22, 0.20 };
-#include "guetzli/order.inc"
-
-	for (int c = 0; c < 3; ++c) { // TOBEREMOVE:��������block��input_order,��0�Ĵ��
-		if (!(comp_mask & (1 << c))) continue;
-		for (int k = 1; k < kDCTBlockSize; ++k) {
-			int idx = c * kDCTBlockSize + k; // TOBEREMOVE:ÿ����������
-			if (block[idx] != 0) {
-				float score;
-				if (params_.new_zeroing_model) {
-					score = std::abs(orig_block[idx]) * csf[idx] + bias[idx];
-				}
-				else {
-					score = static_cast<float>((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) * kWeight[c] / oldCsf[k]);
-				}
-				input_order.push_back(std::make_pair(idx, score));
-			}
-		}
-	}
-	std::sort(input_order.begin(), input_order.end(), [](const std::pair<int, float>& a, const std::pair<int, float>& b) { return a.second < b.second; });
-
-}
 
 // REQUIRES: block[c*64...(c*64+63)] is all zero if (comp_mask & (1<<c)) == 0.
 void Processor::ComputeBlockZeroingOrder(
@@ -408,8 +387,39 @@ void Processor::ComputeBlockZeroingOrder(
     const int factor_y, const uint8_t comp_mask, OutputImage* img,
     std::vector<CoeffData>* output_order) {
 
-	std::vector<std::pair<int, float> > input_order;
-	func(block, orig_block, comp_mask, params_, input_order);
+  static const uint8_t oldCsf[kDCTBlockSize] = {
+      10, 10, 20, 40, 60, 70, 80, 90,
+      10, 20, 30, 60, 70, 80, 90, 90,
+      20, 30, 60, 70, 80, 90, 90, 90,
+      40, 60, 70, 80, 90, 90, 90, 90,
+      60, 70, 80, 90, 90, 90, 90, 90,
+      70, 80, 90, 90, 90, 90, 90, 90,
+      80, 90, 90, 90, 90, 90, 90, 90,
+      90, 90, 90, 90, 90, 90, 90, 90,
+  };
+  static const double kWeight[3] = { 1.0, 0.22, 0.20 };
+#include "guetzli/order.inc"
+  std::vector<std::pair<int, float> > input_order;
+  for (int c = 0; c < 3; ++c) {
+    if (!(comp_mask & (1 << c))) continue;
+    for (int k = 1; k < kDCTBlockSize; ++k) {
+      int idx = c * kDCTBlockSize + k;
+      if (block[idx] != 0) {
+        float score;
+        if (params_.new_zeroing_model) {
+          score = std::abs(orig_block[idx]) * csf[idx] + bias[idx];
+        } else {
+          score = static_cast<float>((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) *
+                  kWeight[c] / oldCsf[k]);
+        }
+        input_order.push_back(std::make_pair(idx, score));
+      }
+    }
+  }
+  std::sort(input_order.begin(), input_order.end(),
+            [](const std::pair<int, float>& a, const std::pair<int, float>& b) {
+              return a.second < b.second; });
+
 
 	coeff_t processed_block[kBlockSize];
 	memcpy(processed_block, block, sizeof(processed_block));
@@ -436,7 +446,7 @@ void Processor::ComputeBlockZeroingOrder(
       }
 
       float max_err = 0;
-/*
+
       for (int iy = 0; iy < factor_y; ++iy) {
         for (int ix = 0; ix < factor_x; ++ix) {
           int block_xx = block_x * factor_x + ix;
@@ -447,8 +457,6 @@ void Processor::ComputeBlockZeroingOrder(
           }
         }
       }
-*/
-	  max_err = ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(*img, 0, 0, candidate_block);
 
       if (max_err < best_err) { // TOBEREMOVE:�ҳ���С����ֵ��i
         best_err = max_err;
@@ -560,7 +568,7 @@ size_t EstimateDCSize(const JPEGData& jpg) {
 
 }  // namespace
 
-void Processor::CompareBlockZeroingOrderBatch(const JPEGData& jpg, OutputImage* img)
+void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const double target_mul, bool stop_early)
 {
     // we only support factor_x == factor_y == 1
     const int width = img->width();
@@ -576,6 +584,7 @@ void Processor::CompareBlockZeroingOrderBatch(const JPEGData& jpg, OutputImage*
     std::vector<coeff_t> orig_block_batch(num_blocks * kBlockSize);   // [block_r block_g block_b]
     std::vector<coeff_t> block_batch(num_blocks * kBlockSize);        // [block_r block_g block_b]
 
+    // Step 1: gather the coefficient blocks of every block (current output image and original JPEG).
     for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
         for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
             coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize];
@@ -583,27 +592,124 @@ void Processor::CompareBlockZeroingOrderBatch(const JPEGData& jpg, OutputImage*
 
             for (int c = 0; c < 3; ++c)
             {
-                img->component(c).GetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]); // TOBEREMOVE:ȡ���Ա�ͼ��blockϵ��
+                img->component(c).GetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]);
 
                 const JPEGComponent& comp = jpg.components[c];
                 int jpg_block_ix = block_y * comp.width_in_blocks + block_x;
                 memcpy(&orig_block[c * kDCTBlockSize], &comp.coeffs[jpg_block_ix * kDCTBlockSize], kDCTBlockSize * sizeof(orig_block[0])); // TOBEREMOVE:ȡ��ԭʼͼ��blockϵ��
             }
+        }
+    }
+
+    std::vector<int> candidate_coeff_offsets(num_blocks + 1);
+    std::vector<uint8_t> candidate_coeffs;
+    std::vector<float> candidate_coeff_errors;
+
+    // Step 2: compute the zeroing order (candidate coefficients and their errors) for each block.
+    for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
+        for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
+            coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize];
+            coeff_t *block = &block_batch[block_ix * kBlockSize];
 
-/*
             std::vector<CoeffData> block_order;
-            block_order.clear();
-            ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, factor_y, comp_mask, img, &block_order); // TOBEREMOVE:����ԭʼblock�ͶԱ�ͼ��block����zeroing order����block_order
+
+            ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, &block_order);
+
+            // NOTE: the per-block bookkeeping below is not batched yet (the original comment here was mis-encoded).
             candidate_coeff_offsets[block_ix] = candidate_coeffs.size();
-            for (size_t i = 0; i < block_order.size(); ++i) { // TOBEREMOVE:�ѽ����ֵ����ѡϵ��
+            for (size_t i = 0; i < block_order.size(); ++i) {
                 candidate_coeffs.push_back(block_order[i].idx);
                 candidate_coeff_errors.push_back(block_order[i].block_err);
             }
-*/
         }
     }
 
+    //
     comparator_->FinishBlockComparisons(); // TOBEREMOVE:�������
+    // Terminate the offsets table: SelectFrequencyBackEnd reads offsets[block_ix + 1] to size each block's candidate run.
+    candidate_coeff_offsets[num_blocks] = candidate_coeffs.size();
+    SelectFrequencyBackEnd(jpg, img, 7, target_mul, stop_early,
+        candidate_coeff_offsets, candidate_coeffs, candidate_coeff_errors);
+}
+// Batch-path overload: builds the greedy zeroing order for one block's coefficients and appends it to *output_order.
+void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
+    const int block_x, const int block_y, std::vector<CoeffData>* output_order)
+{
+    static const uint8_t oldCsf[kDCTBlockSize] = {
+        10, 10, 20, 40, 60, 70, 80, 90,
+        10, 20, 30, 60, 70, 80, 90, 90,
+        20, 30, 60, 70, 80, 90, 90, 90,
+        40, 60, 70, 80, 90, 90, 90, 90,
+        60, 70, 80, 90, 90, 90, 90, 90,
+        70, 80, 90, 90, 90, 90, 90, 90,
+        80, 90, 90, 90, 90, 90, 90, 90,
+        90, 90, 90, 90, 90, 90, 90, 90,
+    };
+    static const double kWeight[3] = { 1.0, 0.22, 0.20 };
+#include "guetzli/order.inc"
+    std::vector<std::pair<int, float> > input_order;
+    for (int c = 0; c < 3; ++c) {  // no component-mask filter here: all three components are scanned
+        for (int k = 1; k < kDCTBlockSize; ++k) {  // k starts at 1: index 0 of each component is never a candidate
+            int idx = c * kDCTBlockSize + k;
+            if (block[idx] != 0) {
+                float score;
+                if (params_.new_zeroing_model) {  // csf/bias are not declared in this block; presumably supplied by order.inc
+                    score = std::abs(orig_block[idx]) * csf[idx] + bias[idx];
+                }
+                else {
+                    score = static_cast<float>((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) * kWeight[c] / oldCsf[k]);
+                }
+                input_order.push_back(std::make_pair(idx, score));
+            }
+        }
+    }
+    std::sort(input_order.begin(), input_order.end(), [](const std::pair<int, float>& a, const std::pair<int, float>& b) { return a.second < b.second; });
+    // Working copy of the block; coefficients are zeroed in it one at a time as they are committed below.
+    coeff_t processed_block[kBlockSize];
+    memcpy(processed_block, block, sizeof(processed_block));
+    // Presumably selects the block that the CompareBlockEx calls below evaluate — confirm in the comparator.
+    comparator_->SwitchBlock(block_x, block_y, 1, 1);
+
+    while (!input_order.empty()) {
+        float best_err = 1e17f;
+        int best_i = 0;
+        for (size_t i = 0; i < std::min<size_t>(params_.zeroing_greedy_lookahead, input_order.size()); ++i)
+        {
+            coeff_t candidate_block[kBlockSize];
+            memcpy(candidate_block, processed_block, sizeof(candidate_block));
+
+            const int idx = input_order[i].first;
+
+            candidate_block[idx] = 0; // Tentatively zero this coefficient in the scratch copy only.
+
+            float max_err = ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(candidate_block);
+            if (max_err < best_err) { // Keep the look-ahead candidate with the smallest error.
+                best_err = max_err;
+                best_i = i;
+            }
+        }
+
+        int idx = input_order[best_i].first;
+        processed_block[idx] = 0;
+        input_order.erase(input_order.begin() + best_i);
+
+        output_order->push_back({ idx, best_err }); // Record the committed coefficient index together with its error.
+    }
+
+    // Post-process the recorded errors: a backward running minimum, then truncation at the error limit.
+    // Make the block error values monotonic.
+    float min_err = 1e10;
+    for (int i = output_order->size() - 1; i >= 0; --i) {
+        min_err = std::min(min_err, (*output_order)[i].block_err);
+        (*output_order)[i].block_err = min_err;
+    }
+    // Cut off at the block error limit.
+    size_t num = 0;
+    while (num < output_order->size() &&
+        (*output_order)[num].block_err <= comparator_->BlockErrorLimit()) {
+        ++num;
+    }
+    output_order->resize(num);
+}
 
 void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
@@ -612,7 +718,6 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
                                        bool stop_early) {
   const int width = img->width();
   const int height = img->height();
-  const int ncomp = jpg.components.size();
   const int last_c = Log2FloorNonZero(comp_mask);
   if (static_cast<size_t>(last_c) >= jpg.components.size()) return;
   const int factor_x = img->component(last_c).factor_x();
@@ -659,183 +764,207 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
   comparator_->FinishBlockComparisons(); // TOBEREMOVE:�������
   candidate_coeff_offsets[num_blocks] = candidate_coeffs.size();
 
-  std::vector<JpegHistogram> ac_histograms(ncomp);
-  int jpg_header_size, dc_size;
-  {
-    JPEGData jpg_out = jpg;
-    img->SaveToJpegData(&jpg_out);
-    jpg_header_size = JpegHeaderSize(jpg_out, params_.clear_metadata);
-    dc_size = EstimateDCSize(jpg_out);
-    BuildACHistograms(jpg_out, &ac_histograms[0]);
-  }
-  std::vector<uint8_t> ac_depths;
-  int ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths);
-  int base_size = jpg_header_size + dc_size + ac_histogram_size +
-      EntropyCodedDataSize(ac_histograms, ac_depths);
-  int prev_size = base_size;
-
-  std::vector<float> max_block_error(num_blocks);
-  std::vector<int> last_indexes(num_blocks);
-
-  bool first_up_iter = true;
-  for (int direction : {1, -1}) {
-    for (;;) {
-      if (stop_early && direction == -1) {
-        if (prev_size > 1.01 * final_output_->jpeg_data.size()) {
-          // If we are down-adjusting the error, the output size will only keep
-          // increasing.
-          // TODO(user): Do this check always by comparing only the size
-          // of the currently processed components.
-          break;
-        }
-      }
-      std::vector<std::pair<int, float> > global_order;
-      int blocks_to_change;
-      std::vector<float> block_weight;
-      for (int rblock = 1; rblock <= 4; ++rblock) {
-        block_weight = std::vector<float>(num_blocks);
-        std::vector<float> distmap(width * height);
-        if (!first_up_iter) {
-          distmap = comparator_->distmap();
-        }
-        comparator_->ComputeBlockErrorAdjustmentWeights(
-            direction, rblock, target_mul, factor_x, factor_y, distmap,
-            &block_weight);
-        global_order.clear();
-        blocks_to_change = 0;
-        for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
-          for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
-            const int last_index = last_indexes[block_ix];
-            const int offset = candidate_coeff_offsets[block_ix];
-            const int num_candidates =
-                candidate_coeff_offsets[block_ix + 1] - offset;
-            const float* candidate_errors = &candidate_coeff_errors[offset];
-            const float max_err = max_block_error[block_ix];
-            if (block_weight[block_ix] == 0) {
-              continue;
+  SelectFrequencyBackEnd(jpg, img, comp_mask, target_mul, stop_early,
+      candidate_coeff_offsets, candidate_coeffs, candidate_coeff_errors);
+}
+
+void Processor::SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img,
+                                        const uint8_t comp_mask,
+                                        const double target_mul,
+                                        bool stop_early,
+                                        std::vector<int> &candidate_coeff_offsets,
+                                        std::vector<uint8_t>& candidate_coeffs,
+                                        std::vector<float> &candidate_coeff_errors)
+{
+    const int ncomp = jpg.components.size();
+    const int width = img->width();
+    const int height = img->height();
+    const int last_c = Log2FloorNonZero(comp_mask);
+    if (static_cast<size_t>(last_c) >= jpg.components.size()) return;
+    const int factor_x = img->component(last_c).factor_x();
+    const int factor_y = img->component(last_c).factor_y();
+    const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x);
+    const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y);
+    const int num_blocks = block_width * block_height;
+
+    std::vector<JpegHistogram> ac_histograms(ncomp);
+    int jpg_header_size, dc_size;
+    {
+        JPEGData jpg_out = jpg;
+        img->SaveToJpegData(&jpg_out);
+        jpg_header_size = JpegHeaderSize(jpg_out, params_.clear_metadata);
+        dc_size = EstimateDCSize(jpg_out);
+        BuildACHistograms(jpg_out, &ac_histograms[0]);
+    }
+    std::vector<uint8_t> ac_depths;
+    int ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths);
+    int base_size = jpg_header_size + dc_size + ac_histogram_size +
+        EntropyCodedDataSize(ac_histograms, ac_depths);
+    int prev_size = base_size;
+
+    std::vector<float> max_block_error(num_blocks);
+    std::vector<int> last_indexes(num_blocks);
+
+    bool first_up_iter = true;
+    for (int direction : {1, -1}) {
+        for (;;) {
+            if (stop_early && direction == -1) {
+                if (prev_size > 1.01 * final_output_->jpeg_data.size()) {
+                    // If we are down-adjusting the error, the output size will only keep
+                    // increasing.
+                    // TODO(user): Do this check always by comparing only the size
+                    // of the currently processed components.
+                    break;
+                }
             }
-            if (direction > 0) {
-              for (size_t i = last_index; i < num_candidates; ++i) {
-                float val = ((candidate_errors[i] - max_err) /
-                             block_weight[block_ix]);
-                global_order.push_back(std::make_pair(block_ix, val));
-              }
-              blocks_to_change += (last_index < num_candidates ? 1 : 0);
-            } else {
-              for (int i = last_index - 1; i >= 0; --i) {
-                float val = ((max_err - candidate_errors[i]) /
-                             block_weight[block_ix]);
-                global_order.push_back(std::make_pair(block_ix, val));
-              }
-              blocks_to_change += (last_index > 0 ? 1 : 0);
+            std::vector<std::pair<int, float> > global_order;
+            int blocks_to_change;
+            std::vector<float> block_weight;
+            for (int rblock = 1; rblock <= 4; ++rblock) {
+                block_weight = std::vector<float>(num_blocks);
+                std::vector<float> distmap(width * height);
+                if (!first_up_iter) {
+                    distmap = comparator_->distmap();
+                }
+                comparator_->ComputeBlockErrorAdjustmentWeights(
+                    direction, rblock, target_mul, factor_x, factor_y, distmap,
+                    &block_weight);
+                global_order.clear();
+                blocks_to_change = 0;
+                for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
+                    for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
+                        const int last_index = last_indexes[block_ix];
+                        const int offset = candidate_coeff_offsets[block_ix];
+                        const int num_candidates =
+                            candidate_coeff_offsets[block_ix + 1] - offset;
+                        const float* candidate_errors = &candidate_coeff_errors[offset];
+                        const float max_err = max_block_error[block_ix];
+                        if (block_weight[block_ix] == 0) {
+                            continue;
+                        }
+                        if (direction > 0) {
+                            for (size_t i = last_index; i < num_candidates; ++i) {
+                                float val = ((candidate_errors[i] - max_err) /
+                                    block_weight[block_ix]);
+                                global_order.push_back(std::make_pair(block_ix, val));
+                            }
+                            blocks_to_change += (last_index < num_candidates ? 1 : 0);
+                        }
+                        else {
+                            for (int i = last_index - 1; i >= 0; --i) {
+                                float val = ((max_err - candidate_errors[i]) /
+                                    block_weight[block_ix]);
+                                global_order.push_back(std::make_pair(block_ix, val));
+                            }
+                            blocks_to_change += (last_index > 0 ? 1 : 0);
+                        }
+                    }
+                }
+                if (!global_order.empty()) {
+                    // If we found something to adjust with the current block adjustment
+                    // radius, we can stop and adjust the blocks we have.
+                    break;
+                }
             }
-          }
-        }
-        if (!global_order.empty()) {
-          // If we found something to adjust with the current block adjustment
-          // radius, we can stop and adjust the blocks we have.
-          break;
-        }
-      }
 
-      if (global_order.empty()) {
-        break;
-      }
+            if (global_order.empty()) {
+                break;
+            }
 
-      std::sort(global_order.begin(), global_order.end(),
+            std::sort(global_order.begin(), global_order.end(),
                 [](const std::pair<int, float>& a,
-                   const std::pair<int, float>& b) {
-                  return a.second < b.second; });
+                    const std::pair<int, float>& b) {
+                return a.second < b.second; });
 
-      double rel_size_delta = direction > 0 ? 0.01 : 0.0005;
-      if (direction > 0 && comparator_->DistanceOK(1.0)) {
-        rel_size_delta = 0.05;
-      }
-      double min_size_delta = base_size * rel_size_delta;
-
-      float coeffs_to_change_per_block =
-          direction > 0 ? 2.0f : factor_x * factor_y * 0.2f;
-      int min_coeffs_to_change = coeffs_to_change_per_block * blocks_to_change;
-
-      if (first_up_iter) {
-        const float limit = 0.75f * comparator_->BlockErrorLimit();
-        auto it = std::partition_point(global_order.begin(), global_order.end(),
-                                       [=](const std::pair<int, float>& a) {
-                                         return a.second < limit; });
-        min_coeffs_to_change = std::max<int>(min_coeffs_to_change,
-                                             it - global_order.begin());
-        first_up_iter = false;
-      }
+            double rel_size_delta = direction > 0 ? 0.01 : 0.0005;
+            if (direction > 0 && comparator_->DistanceOK(1.0)) {
+                rel_size_delta = 0.05;
+            }
+            double min_size_delta = base_size * rel_size_delta;
+
+            float coeffs_to_change_per_block =
+                direction > 0 ? 2.0f : factor_x * factor_y * 0.2f;
+            int min_coeffs_to_change = coeffs_to_change_per_block * blocks_to_change;
+
+            if (first_up_iter) {
+                const float limit = 0.75f * comparator_->BlockErrorLimit();
+                auto it = std::partition_point(global_order.begin(), global_order.end(),
+                    [=](const std::pair<int, float>& a) {
+                    return a.second < limit; });
+                min_coeffs_to_change = std::max<int>(min_coeffs_to_change,
+                    it - global_order.begin());
+                first_up_iter = false;
+            }
 
-      std::set<int> changed_blocks;
-      float val_threshold = 0.0;
-      int changed_coeffs = 0;
-      int est_jpg_size = prev_size;
-      for (size_t i = 0; i < global_order.size(); ++i) {
-        const int block_ix = global_order[i].first;
-        const int block_x = block_ix % block_width;
-        const int block_y = block_ix / block_width;
-        const int last_idx = last_indexes[block_ix];
-        const int offset = candidate_coeff_offsets[block_ix];
-        const uint8_t* candidates = &candidate_coeffs[offset];
-        const int idx = candidates[last_idx + std::min(direction, 0)];
-        const int c = idx / kDCTBlockSize;
-        const int k = idx % kDCTBlockSize;
-        const int* quant = img->component(c).quant();
-        const JPEGComponent& comp = jpg.components[c];
-        const int jpg_block_ix = block_y * comp.width_in_blocks + block_x;
-        const int newval = direction > 0 ? 0 : Quantize(
-            comp.coeffs[jpg_block_ix * kDCTBlockSize + k], quant[k]);
-        coeff_t block[kDCTBlockSize] = { 0 };
-        img->component(c).GetCoeffBlock(block_x, block_y, block);
-        UpdateACHistogram(-1, block, quant, &ac_histograms[c]);
-        block[k] = newval;
-        UpdateACHistogram(1, block, quant, &ac_histograms[c]);
-        img->component(c).SetCoeffBlock(block_x, block_y, block);
-        last_indexes[block_ix] += direction;
-        changed_blocks.insert(block_ix);
-        val_threshold = global_order[i].second;
-        ++changed_coeffs;
-        static const int kEntropyCodeUpdateFreq = 10;
-        if (i % kEntropyCodeUpdateFreq == 0) {
-          ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths);
-        }
-        est_jpg_size = jpg_header_size + dc_size + ac_histogram_size +
-            EntropyCodedDataSize(ac_histograms, ac_depths);
-        if (changed_coeffs > min_coeffs_to_change &&
-            std::abs(est_jpg_size - prev_size) > min_size_delta) {
-          break;
-        }
-      }
-      size_t global_order_size = global_order.size();
-      std::vector<std::pair<int, float>>().swap(global_order);
+            std::set<int> changed_blocks;
+            float val_threshold = 0.0;
+            int changed_coeffs = 0;
+            int est_jpg_size = prev_size;
+            for (size_t i = 0; i < global_order.size(); ++i) {
+                const int block_ix = global_order[i].first;
+                const int block_x = block_ix % block_width;
+                const int block_y = block_ix / block_width;
+                const int last_idx = last_indexes[block_ix];
+                const int offset = candidate_coeff_offsets[block_ix];
+                const uint8_t* candidates = &candidate_coeffs[offset];
+                const int idx = candidates[last_idx + std::min(direction, 0)];
+                const int c = idx / kDCTBlockSize;
+                const int k = idx % kDCTBlockSize;
+                const int* quant = img->component(c).quant();
+                const JPEGComponent& comp = jpg.components[c];
+                const int jpg_block_ix = block_y * comp.width_in_blocks + block_x;
+                const int newval = direction > 0 ? 0 : Quantize(
+                    comp.coeffs[jpg_block_ix * kDCTBlockSize + k], quant[k]);
+                coeff_t block[kDCTBlockSize] = { 0 };
+                img->component(c).GetCoeffBlock(block_x, block_y, block);
+                UpdateACHistogram(-1, block, quant, &ac_histograms[c]);
+                block[k] = newval;
+                UpdateACHistogram(1, block, quant, &ac_histograms[c]);
+                img->component(c).SetCoeffBlock(block_x, block_y, block);
+                last_indexes[block_ix] += direction;
+                changed_blocks.insert(block_ix);
+                val_threshold = global_order[i].second;
+                ++changed_coeffs;
+                static const int kEntropyCodeUpdateFreq = 10;
+                if (i % kEntropyCodeUpdateFreq == 0) {
+                    ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths);
+                }
+                est_jpg_size = jpg_header_size + dc_size + ac_histogram_size +
+                    EntropyCodedDataSize(ac_histograms, ac_depths);
+                if (changed_coeffs > min_coeffs_to_change &&
+                    std::abs(est_jpg_size - prev_size) > min_size_delta) {
+                    break;
+                }
+            }
+            size_t global_order_size = global_order.size();
+            std::vector<std::pair<int, float>>().swap(global_order);
 
-      for (int i = 0; i < num_blocks; ++i) {
-        max_block_error[i] += block_weight[i] * val_threshold * direction;
-      }
+            for (int i = 0; i < num_blocks; ++i) {
+                max_block_error[i] += block_weight[i] * val_threshold * direction;
+            }
 
-      ++stats_->counters[kNumItersCnt];
-      ++stats_->counters[direction > 0 ? kNumItersUpCnt : kNumItersDownCnt];
-      std::string encoded_jpg;
-      {
-        JPEGData jpg_out = jpg;
-        img->SaveToJpegData(&jpg_out);
-        OutputJpeg(jpg_out, &encoded_jpg);
-      }
-      GUETZLI_LOG(stats_,
-                  "Iter %2d: %s(%d) %s Coeffs[%d/%zd] "
-                  "Blocks[%zd/%d/%d] ValThres[%.4f] Out[%7zd] EstErr[%.2f%%]",
-                  stats_->counters[kNumItersCnt], img->FrameTypeStr().c_str(),
-                  comp_mask, direction > 0 ? "up" : "down", changed_coeffs,
-                  global_order_size, changed_blocks.size(),
-                  blocks_to_change, num_blocks, val_threshold,
-                  encoded_jpg.size(),
-                  100.0 - (100.0 * est_jpg_size) / encoded_jpg.size());
-      comparator_->Compare(*img);
-      MaybeOutput(encoded_jpg);
-      prev_size = est_jpg_size;
+            ++stats_->counters[kNumItersCnt];
+            ++stats_->counters[direction > 0 ? kNumItersUpCnt : kNumItersDownCnt];
+            std::string encoded_jpg;
+            {
+                JPEGData jpg_out = jpg;
+                img->SaveToJpegData(&jpg_out);
+                OutputJpeg(jpg_out, &encoded_jpg);
+            }
+            GUETZLI_LOG(stats_,
+                "Iter %2d: %s(%d) %s Coeffs[%d/%zd] "
+                "Blocks[%zd/%d/%d] ValThres[%.4f] Out[%7zd] EstErr[%.2f%%]",
+                stats_->counters[kNumItersCnt], img->FrameTypeStr().c_str(),
+                comp_mask, direction > 0 ? "up" : "down", changed_coeffs,
+                global_order_size, changed_blocks.size(),
+                blocks_to_change, num_blocks, val_threshold,
+                encoded_jpg.size(),
+                100.0 - (100.0 * est_jpg_size) / encoded_jpg.size());
+            comparator_->Compare(*img);
+            MaybeOutput(encoded_jpg);
+            prev_size = est_jpg_size;
+        }
     }
-  }
 }
 
 bool IsGrayscale(const JPEGData& jpg) {

From dae16735416e0d0a31f9cf7a1125a7f0c19def33 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 15 May 2017 02:47:42 +0800
Subject: [PATCH 068/189] =?UTF-8?q?=E5=BB=BA=E7=AB=8Bcl=E7=AB=AF=E7=9A=84?=
 =?UTF-8?q?=E6=89=B9=E9=87=8F=E5=8C=96ComputeZeroingOrder=E9=83=BD?=
 =?UTF-8?q?=E6=9C=89=E5=93=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl             | 229 ++++++++++++++++++++++++++++-
 clguetzli/clguetzli_comparator.cpp | 103 +++++++++++--
 clguetzli/clguetzli_comparator.h   |  13 +-
 guetzli/processor.cc               |  10 +-
 4 files changed, 330 insertions(+), 25 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 9d132ae8..906b2149 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -8,6 +8,7 @@
 
 #define kBlockEdge 8
 #define kBlockSize (kBlockEdge * kBlockEdge)
+#define kDCTBlockSize (kBlockEdge * kBlockEdge)
 #define kBlockEdgeHalf  (kBlockEdge / 2)
 #define kBlockHalf (kBlockEdge * kBlockEdgeHalf)
 
@@ -1413,4 +1414,230 @@ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *
     *valx = a0 * r - a1 * g;
     *valy = a2 * r + a3 * g;
     *valz = b;
-}
\ No newline at end of file
+}
+
+
+///==================================================
+typedef struct __IntFloatPair
+{
+    int   idx;
+    float err;
+}IntFloatPair, DCTScoreData, CoeffData;
+
+typedef short coeff_t;  // 16-bit scalar coefficient; OpenCL C 'int16' is a 16-component int vector, not a 16-bit integer
+
+typedef struct __IntFloatPairList
+{
+    int size;
+    IntFloatPair *pData;
+}IntFloatPairList;
+
+// chrisk todo
+// return size
+int list_push_back(IntFloatPairList* list, int i, float f)
+{
+
+}
+
+// chrisk todo
+// remove idx and return size
+int list_erase(IntFloatPairList* list, int idx)
+{
+}
+
+// chrisk todo
+int SortInputOrder(DCTScoreData* input_order, int size)
+{
+/*  TODO(chrisk): port the host-side sort (ascending by score):
+    std::sort(input_order.begin(), input_order.end(),
+        [](const std::pair<int, float>& a, const std::pair<int, float>& b) {
+        return a.second < b.second; });  */
+    return size;  // placeholder: nothing is reordered yet, but the caller (MakeInputOrder) now gets a defined value
+}
+
+// chrisk todo
+// return the count of Non-zero item
+int MakeInputOrder(__global coeff_t *orig_block, DCTScoreData *input_order, int size)
+{
+/*
+    static const double kWeight[3] = { 1.0, 0.22, 0.20 };
+#include "guetzli/order.inc"
+    std::vector<std::pair<int, float> > input_order;
+    for (int c = 0; c < 3; ++c) {
+        if (!(comp_mask & (1 << c))) continue;
+        for (int k = 1; k < kDCTBlockSize; ++k) {
+            int idx = c * kDCTBlockSize + k;
+            if (block[idx] != 0) {
+                float score;
+                if (params_.new_zeroing_model) {
+                    score = std::abs(orig_block[idx]) * csf[idx] + bias[idx];
+                }
+                else {
+                    score = static_cast<float>((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) *
+                        kWeight[c] / oldCsf[k]);
+                }
+                input_order.push_back(std::make_pair(idx, score));
+            }
+        }
+    }
+*/
+    return SortInputOrder(input_order, size);
+}
+
+// chrisk todo
+void BlockToImage(coeff_t *candidate_block, float *r, float *g, float *b)
+{
+
+}
+
+// ian todo
+void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio)
+{
+}
+
+
+// ian todo
+void OpsinDynamicsImageBlock(float *r, float *g, float *b,
+                            float *r_blurred, float *g_blurred, float *b_blurred,
+                            int size)
+{
+
+}
+
+// strong todo
+void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b,
+    float *xyb1_x, float *xyb1_y, float *xyb1_b,
+    float *c0_x, float *c0_y, float *c0_b,
+    float *c1_x, float *c1_y, float *c1_b,
+    int xsize, int ysize)
+{
+}
+
+// TODO(strong): skeleton only. Scratch pointers below are never initialized and the function always returns 0.
+float CompareBlockEx(coeff_t *candidate_block, float* orig_image_block, float* mask_scale_block)
+{
+    float image_block[3 * kDCTBlockSize];  // three consecutive planes of kDCTBlockSize floats: r1, g1, b1
+    float *r1 = image_block;
+    float *g1 = &image_block[kDCTBlockSize];
+    float *b1 = &image_block[2 * kDCTBlockSize];
+    BlockToImage(candidate_block, r1, g1, b1);
+
+    float *r0 = orig_image_block;  // orig_image_block is indexed as three consecutive planes as well
+    float *g0 = &orig_image_block[kDCTBlockSize];
+    float *b0 = &orig_image_block[2 * kDCTBlockSize];
+
+    float *cr0, *cg0, *cb0;  // NOTE(review): uninitialized, yet passed to MaskHighIntensityChangeBlock below
+    float *cr1, *cg1, *cb1;
+
+    float *r0_blurred, *g0_blurred, *b0_blurred;  // NOTE(review): uninitialized, yet passed to OpsinDynamicsImageBlock below
+    float *r1_blurred, *g1_blurred, *b1_blurred;
+
+    //BlurEx(r0,..
+    //BlurEx
+    //BlurEx
+    //BlurEx.
+    OpsinDynamicsImageBlock(r0, g0, b0, r0_blurred, g0_blurred, b0_blurred, kDCTBlockSize);
+    OpsinDynamicsImageBlock(r1, g1, b1, r1_blurred, g1_blurred, b1_blurred, kDCTBlockSize);
+
+    MaskHighIntensityChangeBlock(r0, g0, b0, r1, g1, b1, cr0, cg0, cb0, cr1, cg1, cb1, 8, 8);
+    {
+        double b0[3 * kDCTBlockSize];  // NOTE(review): shadows the float* b0 / b1 declared above; never filled (copy loop is commented out)
+        double b1[3 * kDCTBlockSize];
+        /*
+            for (int c = 0; c < 3; ++c) {
+                for (int ix = 0; ix < kDCTBlockSize; ++ix) {
+                    b0[c * kDCTBlockSize + ix] = rgb0[c][ix];
+                    b1[c * kDCTBlockSize + ix] = rgb1[c][ix];
+                }
+            }
+        */
+        double diff_xyz_dc[3] = { 0.0 };
+        double diff_xyz_ac[3] = { 0.0 };
+        double diff_xyz_edge_dc[3] = { 0.0 };
+
+        ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc);
+
+        double diff = 0.0;
+        double diff_edge = 0.0;
+        /*
+            for (int c = 0; c < 3; ++c) {
+                diff += diff_xyz_dc[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c];
+                diff += diff_xyz_ac[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c];
+                diff_edge += diff_xyz_edge_dc[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c];
+            }
+            const double kEdgeWeight = 0.05;
+            return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge);
+        */
+    }
+    return 0;  // the weighted-difference result is not computed yet (see the commented-out code above)
+}
+
+// strong todo
+__kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/,
+                                         __global coeff_t *block_list/*in*/,
+                                         __global float *orig_image/*in*/,
+                                         __global CoeffData *output_order_list/*out*/)
+{
+    int block_idx = get_global_id(0);
+
+    __global coeff_t *orig_block = orig_block_list + block_idx * kBlockSize;
+    __global coeff_t *block      = block_list + block_idx * kBlockSize;
+
+    DCTScoreData input_order_data[kBlockSize];
+    CoeffData    output_order_data[kBlockSize];
+
+    MakeInputOrder(orig_block, input_order_data, kBlockSize);
+    IntFloatPairList input_order = { kBlockSize, input_order_data };
+    IntFloatPairList output_order = { kBlockSize, output_order_data };
+
+
+    coeff_t processed_block[kBlockSize];
+ //   memcpy(processed_block, block, sizeof(processed_block);
+
+    while (input_order.size > 0)
+    {
+        float best_err = 1e17f;
+        int best_i = 0;
+        for (int i = 0; i < min(3, input_order.size); i++)
+        {
+            coeff_t candidate_block[kBlockSize];
+            // memcpy(candidate_block, processed_block, sizeof(candidate_block);
+
+            const int idx = input_order.pData[i].idx;
+
+            candidate_block[idx] = 0;
+
+            float max_err = CompareBlockEx(candidate_block, 0, 0);
+            if (max_err < best_err)
+            {
+                best_err = max_err;
+                best_i = i;
+            }
+        }
+
+        int idx = input_order.pData[best_i].idx;
+        processed_block[idx] = 0;
+        list_erase(&input_order, best_i);
+
+        list_push_back(&output_order, idx, best_err);
+    }
+    // ע��output_order�����resize���ǰ�β������λ0
+/*
+    // TOBEREMOVE:�����Ƴ�err������error���Ƶ���أ�����ԭ�Ա�ͼ��ԭʼֵ��
+    // Make the block error values monotonic.
+    float min_err = 1e10;
+    for (int i = output_order->size() - 1; i >= 0; --i) {
+    min_err = std::min(min_err, (*output_order)[i].block_err);
+    (*output_order)[i].block_err = min_err;
+    }
+    // Cut off at the block error limit.
+    size_t num = 0;
+    while (num < output_order->size() &&
+    (*output_order)[num].block_err <= comparator_->BlockErrorLimit()) {
+    ++num;
+    }
+    output_order->resize(num);
+*/
+
+    // memcpy(output_data_list + block_idx * kBlockSize
+}
diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp
index 5970e6d8..3babe180 100644
--- a/clguetzli/clguetzli_comparator.cpp
+++ b/clguetzli/clguetzli_comparator.cpp
@@ -309,26 +309,89 @@ namespace guetzli
 	void ButteraugliComparatorEx::StartBlockComparisons()
 	{
 		ButteraugliComparator::StartBlockComparisons();
+
+        const int width = width_;
+        const int height = height_;
+        const int factor_x = 1;
+        const int factor_y = 1;
+
+        const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x);
+        const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y);
+        const int num_blocks = block_width * block_height;
+
+        const double* lut = Srgb8ToLinearTable();
+
+        imgOpsinDynamicsBlockList.resize(num_blocks * 3 * kDCTBlockSize);
+        imgMaskXyzScaleBlockList.resize(num_blocks * 3);
+        for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y)
+        {
+            for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix)
+            {
+                float* curR = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize];
+                float* curG = curR + kDCTBlockSize;
+                float* curB = curG + kDCTBlockSize;
+
+                for (int iy = 0, i = 0; iy < 8; ++iy) {
+                    for (int ix = 0; ix < 8; ++ix, ++i) {
+                        int x = std::min(8 * block_x + ix, width - 1);
+                        int y = std::min(8 * block_y + iy, height - 1);
+                        int px = y * width + x;
+
+                        curR[i] = lut[rgb_orig_[3 * px]];
+                        curG[i] = lut[rgb_orig_[3 * px + 1]];
+                        curB[i] = lut[rgb_orig_[3 * px + 2]];
+                    }
+                }
+
+                int xmin = block_x * 8;
+                int ymin = block_y * 8;
+
+                imgMaskXyzScaleBlockList[block_ix * 3] = mask_xyz_[0][ymin * width_ + xmin];
+                imgMaskXyzScaleBlockList[block_ix * 3 + 1] = mask_xyz_[1][ymin * width_ + xmin];
+                imgMaskXyzScaleBlockList[block_ix * 3 + 2] = mask_xyz_[2][ymin * width_ + xmin];
+            }
+        }
+
+
 	}
 
+    void ButteraugliComparatorEx::FinishBlockComparisons() {
+        ButteraugliComparator::FinishBlockComparisons();
+
+        imgOpsinDynamicsBlockList.clear();
+        imgMaskXyzScaleBlockList.clear();
+    }
+
 	void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y)
 	{
+        block_x_ = block_x;
+        block_y_ = block_y;
+        factor_x_ = factor_x;
+        factor_y_ = factor_y;
+        return;
 		ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y);
 	}
 
 	double ButteraugliComparatorEx::CompareBlockEx(coeff_t* candidate_block)
 	{
-	int block_x = block_x_ * factor_x_;
-		int block_y = block_y_ * factor_y_;
-		int xmin = 8 * block_x;
-		int ymin = 8 * block_y;
-		int block_ix = 0;
-		const std::vector<std::vector<float> >& rgb0_c = per_block_pregamma_[block_ix];
+        int block_ix = getCurrentBlockIdx();
+
+        float*  block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize];
+
+        // ����ڴ濽�����Ż�������������
+        std::vector< std::vector<float> > rgb0_c;
+        rgb0_c.resize(3);
+        for (int i = 0; i < 3; i++)
+        {
+            rgb0_c[i].resize(kDCTBlockSize);
+            memcpy(rgb0_c[i].data(), block_opsin + i*kDCTBlockSize, kDCTBlockSize * sizeof(float));
+        }
 
         //
 		std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
 		BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data());
 
+        ::butteraugli::OpsinDynamicsImage(8, 8, rgb0_c);
 		::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c);
 
 		std::vector<std::vector<float> > rgb0 = rgb0_c;
@@ -349,20 +412,28 @@ namespace guetzli
 		double diff_xyz_edge_dc[3] = { 0.0 };
 		::butteraugli::ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc);
 
-		double scale[3];
-		for (int c = 0; c < 3; ++c) {
-			scale[c] = mask_xyz_[c][ymin * width_ + xmin];
-		}
-
-		static const double kEdgeWeight = 0.05;
-
 		double diff = 0.0;
 		double diff_edge = 0.0;
 		for (int c = 0; c < 3; ++c) {
-			diff += diff_xyz_dc[c] * scale[c];
-			diff += diff_xyz_ac[c] * scale[c];
-			diff_edge += diff_xyz_edge_dc[c] * scale[c];
+            diff      += diff_xyz_dc[c]      * imgMaskXyzScaleBlockList[block_ix * 3 + c];
+            diff      += diff_xyz_ac[c]      * imgMaskXyzScaleBlockList[block_ix * 3 + c];
+            diff_edge += diff_xyz_edge_dc[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c];
 		}
+        const double kEdgeWeight = 0.05;
 		return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge);
 	}
+
+
+    int ButteraugliComparatorEx::getCurrentBlockIdx(void)
+    {
+        const int width = width_;
+        const int height = height_;
+        const int factor_x = 1;
+        const int factor_y = 1;
+
+        const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x);
+        const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y);
+
+        return block_y_ * block_width + block_x_;
+    }
 }
diff --git a/clguetzli/clguetzli_comparator.h b/clguetzli/clguetzli_comparator.h
index 96642ce6..353eff59 100644
--- a/clguetzli/clguetzli_comparator.h
+++ b/clguetzli/clguetzli_comparator.h
@@ -11,15 +11,16 @@ namespace guetzli {
 			const std::vector<uint8_t>* rgb,
 			const float target_distance, ProcessStats* stats);
 
-        //void Compare(const OutputImage& img) override;
-
-		void StartBlockComparisons();
-		void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y);
+		void StartBlockComparisons() override;
+        void FinishBlockComparisons() override;
+		void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) override;
 
 		double CompareBlockEx(coeff_t* candidate_block);
-
+    private:
+        int getCurrentBlockIdx(void);
 	protected:
-		std::vector<float> imgOpsinDynamicsBlockList;
+		std::vector<float> imgOpsinDynamicsBlockList;   // [RR..RRGG..GGBB..BB]:blockCount
+        std::vector<float> imgMaskXyzScaleBlockList;    // [RGBRGB..RGBRGB]:blockCount
 	};
 
 }
\ No newline at end of file
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index d1607c14..c4d43bef 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -613,7 +613,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
 
             std::vector<CoeffData> block_order;
 
-            ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, &block_order);
+           ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, &block_order);
 
             // ���´�����Ȼû��batch���������ȼ�������������
             candidate_coeff_offsets[block_ix] = candidate_coeffs.size();
@@ -626,6 +626,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
 
     //
     comparator_->FinishBlockComparisons(); // TOBEREMOVE:�������
+    candidate_coeff_offsets[num_blocks] = candidate_coeffs.size();
 
     SelectFrequencyBackEnd(jpg, img, 7, target_mul, stop_early,
         candidate_coeff_offsets, candidate_coeffs, candidate_coeff_errors);
@@ -665,6 +666,10 @@ void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const
     }
     std::sort(input_order.begin(), input_order.end(), [](const std::pair<int, float>& a, const std::pair<int, float>& b) { return a.second < b.second; });
 
+    if (input_order.size() > 10)
+    {
+        int i = 0;
+    }
     coeff_t processed_block[kBlockSize];
     memcpy(processed_block, block, sizeof(processed_block));
 
@@ -1060,7 +1065,8 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in,
     img.ApplyGlobalQuantization(best_q);
 
     if (!downsample) {
-      SelectFrequencyMasking(jpg, &img, 7, 1.0, false);
+      //SelectFrequencyMasking(jpg, &img, 7, 1.0, false);
+        SelectFrequencyMaskingBatch(jpg, &img, 1.0, false);
     } else {
       const float ymul = jpg.components.size() == 1 ? 1.0f : 0.97f;
       SelectFrequencyMasking(jpg, &img, 1, ymul, false);

From 148927eb42889308670c3ac0ff390acb8f3f012a Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 15 May 2017 10:05:12 +0800
Subject: [PATCH 069/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=88=86=E5=B7=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 906b2149..d259327a 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1493,6 +1493,7 @@ void BlockToImage(coeff_t *candidate_block, float *r, float *g, float *b)
 // ian todo
 void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio)
 {
+    // �ο�clBlurEx2��ʵ�֣�sigma = 1.1����ʱstep��diff�����ػ�Ϊ�̶�ֵ
 }
 
 
@@ -1504,7 +1505,7 @@ void OpsinDynamicsImageBlock(float *r, float *g, float *b,
 
 }
 
-// strong todo
+// chrisk todo
 void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b,
     float *xyb1_x, float *xyb1_y, float *xyb1_b,
     float *c0_x, float *c0_y, float *c0_b,

From 55f60a4a40ade3cf015d096f36f1f94eae9ea15c Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 15 May 2017 10:20:10 +0800
Subject: [PATCH 070/189] =?UTF-8?q?=E5=88=86=E9=85=8D=E5=B7=A5=E4=BD=9C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index d259327a..9db59cf1 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1418,6 +1418,9 @@ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *
 
 
 ///==================================================
+// ��λ��������Щ��������Ϊ��ʵ��ButteraugliComparatorEx::CompareBlockEx
+
+// IntFloatPair��Ϊ��ģ��output_order input_order��vector�����Ǵ�С�̶�Ϊ8x8
 typedef struct __IntFloatPair
 {
     int   idx;
@@ -1487,7 +1490,7 @@ int MakeInputOrder(__global coeff_t *orig_block, DCTScoreData *input_order, int
 // chrisk todo
 void BlockToImage(coeff_t *candidate_block, float *r, float *g, float *b)
 {
-
+    // �ο�clguetzli_comparator.cpp : BlockToImage
 }
 
 // ian todo

From 16e27abc880cd8bee7858683317d8992d50d87a1 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 15 May 2017 16:29:51 +0800
Subject: [PATCH 071/189] clComputeBlockZeroingOrder CompareBlockEx

---
 clguetzli/clguetzli.cl | 189 +++++++++++++++++++++++++----------------
 1 file changed, 116 insertions(+), 73 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 9db59cf1..7e8aabc6 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1494,7 +1494,8 @@ void BlockToImage(coeff_t *candidate_block, float *r, float *g, float *b)
 }
 
 // ian todo
-void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio)
+// �����������output
+void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output)
 {
     // �ο�clBlurEx2��ʵ�֣�sigma = 1.1����ʱstep��diff�����ػ�Ϊ�̶�ֵ
 }
@@ -1517,75 +1518,111 @@ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b,
 {
 }
 
+void func(float *r, float *g, float *b, float *r_blurred, float *g_blurred, float *b_blurred)
+{
+    //BlurEx(r, g, b, r_blurred, g_blurred,
+}
+
+typedef union ocl_channels_t
+{
+    struct
+    {
+        float * r;
+        float * g;
+        float * b;
+    };
+
+    float *ch[3];
+}ocl_channels;
+
+void floatcopy(float *dst, float *src, int size)
+{
+    for (int i = 0; i < size; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+void CalcOpsinDynamicsImage(ocl_channels rgb)
+{
+    float rgb_blurred[3][kDCTBlockSize];
+    for (int i = 0; i < 3; i++)
+    {
+        BlurEx(rgb.ch[i], 8, 8, 1.1, 0, rgb_blurred[i]);
+    }
+    OpsinDynamicsImageBlock(rgb.r, rgb.g, rgb.b, rgb_blurred[0], rgb_blurred[1], rgb_blurred[2], kDCTBlockSize);
+}
+
 // strong todo
-float CompareBlockEx(coeff_t *candidate_block, float* orig_image_block, float* mask_scale_block)
+float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block, __global float* mask_scale_block)
 {
-    float image_block[3 * kDCTBlockSize];
-    float *r1 = image_block;
-    float *g1 = &image_block[kDCTBlockSize];
-    float *b1 = &image_block[2 * kDCTBlockSize];
-    BlockToImage(candidate_block, r1, g1, b1);
-
-    float *r0 = orig_image_block;
-    float *g0 = &orig_image_block[kDCTBlockSize];
-    float *b0 = &orig_image_block[2 * kDCTBlockSize];
-
-    float *cr0, *cg0, *cb0;
-    float *cr1, *cg1, *cb1;
-
-    float *r0_blurred, *g0_blurred, *b0_blurred;
-    float *r1_blurred, *g1_blurred, *b1_blurred;
-
-    //BlurEx(r0,..
-    //BlurEx
-    //BlurEx
-    //BlurEx.
-    OpsinDynamicsImageBlock(r0, g0, b0, r0_blurred, g0_blurred, b0_blurred, kDCTBlockSize);
-    OpsinDynamicsImageBlock(r1, g1, b1, r1_blurred, g1_blurred, b1_blurred, kDCTBlockSize);
-
-    MaskHighIntensityChangeBlock(r0, g0, b0, r1, g1, b1, cr0, cg0, cb0, cr1, cg1, cb1, 8, 8);
+    float rgb0[3][kDCTBlockSize];
+    float rgb1[3][kDCTBlockSize];
     {
-        double b0[3 * kDCTBlockSize];
-        double b1[3 * kDCTBlockSize];
-        /*
-            for (int c = 0; c < 3; ++c) {
-                for (int ix = 0; ix < kDCTBlockSize; ++ix) {
-                    b0[c * kDCTBlockSize + ix] = rgb0[c][ix];
-                    b1[c * kDCTBlockSize + ix] = rgb1[c][ix];
-                }
-            }
-        */
-        double diff_xyz_dc[3] = { 0.0 };
-        double diff_xyz_ac[3] = { 0.0 };
-        double diff_xyz_edge_dc[3] = { 0.0 };
-
-        ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc);
-
-        double diff = 0.0;
-        double diff_edge = 0.0;
-        /*
-            for (int c = 0; c < 3; ++c) {
-                diff += diff_xyz_dc[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c];
-                diff += diff_xyz_ac[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c];
-                diff_edge += diff_xyz_edge_dc[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c];
-            }
-            const double kEdgeWeight = 0.05;
-            return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge);
-        */
+        float rgb0_data[3*kDCTBlockSize];
+        ocl_channels rgb0_c = { rgb0_data, &rgb0_data[kDCTBlockSize], &rgb0_data[2 * kDCTBlockSize] };
+        for (int i = 0; i < 3*kDCTBlockSize; i++)
+        {
+            rgb0_data[i] = orig_image_block[i];
+        }
+
+        float image_block[3 * kDCTBlockSize];
+        ocl_channels rgb1_c = { image_block, &image_block[kDCTBlockSize], &image_block[2 * kDCTBlockSize] };
+        BlockToImage(candidate_block, rgb1_c.r, rgb1_c.g, rgb1_c.b);
+
+        CalcOpsinDynamicsImage(rgb0_c);
+        CalcOpsinDynamicsImage(rgb1_c);
+
+        floatcopy(&rgb0[0][0], rgb0_data, 3 * kDCTBlockSize);   // flatten float[3][N] to the float* floatcopy expects
+        floatcopy(&rgb1[0][0], image_block, 3 * kDCTBlockSize); // same for the candidate block
+
+        MaskHighIntensityChangeBlock(rgb0[0],rgb0[1], rgb0[2],
+                                     rgb1[0], rgb1[1], rgb1[2],
+                                    rgb0_c.ch[0], rgb0_c.ch[1], rgb0_c.ch[2],
+                                     rgb1_c.ch[0], rgb1_c.ch[1], rgb1_c.ch[2],
+                                     8, 8);
+
+    }
+
+    // ����ΪɶҪ��floatת��double���ܼ��������㣿
+    double b0[3 * kDCTBlockSize];       //
+    double b1[3 * kDCTBlockSize];
+    for (int c = 0; c < 3; ++c) {
+        for (int ix = 0; ix < kDCTBlockSize; ++ix) {
+            b0[c * kDCTBlockSize + ix] = rgb0[c][ix];
+            b1[c * kDCTBlockSize + ix] = rgb1[c][ix];
+        }
     }
-    return 0;
+
+    double diff_xyz_dc[3] = { 0.0 };
+    double diff_xyz_ac[3] = { 0.0 };
+    double diff_xyz_edge_dc[3] = { 0.0 };
+    ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc);
+
+    double diff = 0.0;
+    double diff_edge = 0.0;
+
+    for (int c = 0; c < 3; ++c) {
+        diff += diff_xyz_dc[c] * mask_scale_block[c];
+        diff += diff_xyz_ac[c] * mask_scale_block[c];
+        diff_edge += diff_xyz_edge_dc[c] * mask_scale_block[c];
+    }
+    const double kEdgeWeight = 0.05;
+    return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge);
 }
 
 // strong todo
 __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/,
                                          __global coeff_t *block_list/*in*/,
                                          __global float *orig_image/*in*/,
+                                         __global float *mask_scale/*in*/,
                                          __global CoeffData *output_order_list/*out*/)
 {
     int block_idx = get_global_id(0);
 
     __global coeff_t *orig_block = orig_block_list + block_idx * kBlockSize;
     __global coeff_t *block      = block_list + block_idx * kBlockSize;
+    __global float* orig_image_block = orig_image + block_idx * kBlockSize;
 
     DCTScoreData input_order_data[kBlockSize];
     CoeffData    output_order_data[kBlockSize];
@@ -1594,9 +1631,10 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/
     IntFloatPairList input_order = { kBlockSize, input_order_data };
     IntFloatPairList output_order = { kBlockSize, output_order_data };
 
-
     coeff_t processed_block[kBlockSize];
- //   memcpy(processed_block, block, sizeof(processed_block);
+    for (int i = 0; i < kBlockSize; i++) {
+        processed_block[i] = block[i];
+    }
 
     while (input_order.size > 0)
     {
@@ -1605,13 +1643,15 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/
         for (int i = 0; i < min(3, input_order.size); i++)
         {
             coeff_t candidate_block[kBlockSize];
-            // memcpy(candidate_block, processed_block, sizeof(candidate_block);
+            for (int i = 0; i < kBlockSize; i++) {
+                candidate_block[i] = processed_block[i];
+            }
 
             const int idx = input_order.pData[i].idx;
 
             candidate_block[idx] = 0;
 
-            float max_err = CompareBlockEx(candidate_block, 0, 0);
+            float max_err = CompareBlockEx(candidate_block, orig_image_block, mask_scale + block_idx * 3);
             if (max_err < best_err)
             {
                 best_err = max_err;
@@ -1626,22 +1666,25 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/
         list_push_back(&output_order, idx, best_err);
     }
     // ע��output_order�����resize���ǰ�β������λ0
-/*
-    // TOBEREMOVE:�����Ƴ�err������error���Ƶ���أ�����ԭ�Ա�ͼ��ԭʼֵ��
-    // Make the block error values monotonic.
     float min_err = 1e10;
-    for (int i = output_order->size() - 1; i >= 0; --i) {
-    min_err = std::min(min_err, (*output_order)[i].block_err);
-    (*output_order)[i].block_err = min_err;
+    for (int i = output_order.size - 1; i >= 0; --i) {
+        min_err = min(min_err, output_order.pData[i].err);
+        output_order.pData[i].err = min_err;
     }
-    // Cut off at the block error limit.
-    size_t num = 0;
-    while (num < output_order->size() &&
-    (*output_order)[num].block_err <= comparator_->BlockErrorLimit()) {
-    ++num;
-    }
-    output_order->resize(num);
-*/
 
-    // memcpy(output_data_list + block_idx * kBlockSize
+    __global CoeffData *output_block = output_order_list + block_idx * kBlockSize;
+
+    for (int i = 0; i < kBlockSize; i++)
+    {
+        if (i >= output_order.size)  // slots at or past size hold no valid entry: zero-fill them
+        {
+            output_block[i].idx = 0;
+            output_block[i].err = 0;
+        }
+        else
+        {
+            output_block[i].idx = output_order.pData[i].idx;
+            output_block[i].err = output_order.pData[i].err;
+        }
+    }
 }

From 87b462ac378463f3a372bfb2917835ac190b495d Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 15 May 2017 16:47:07 +0800
Subject: [PATCH 072/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3n=E5=8D=A1=E7=BC=96?=
 =?UTF-8?q?=E8=AF=91=E5=85=BC=E5=AE=B9=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 7e8aabc6..b33e98ce 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1439,18 +1439,20 @@ typedef struct __IntFloatPairList
 // return size
 int list_push_back(IntFloatPairList* list, int i, float f)
 {
-
+    return 0;
 }
 
 // chrisk todo
 // remove idx and return size
 int list_erase(IntFloatPairList* list, int idx)
 {
+    return 0;
 }
 
 // chrisk todo
 int SortInputOrder(DCTScoreData* input_order, int size)
 {
+    return 0;
 /*
     std::sort(input_order.begin(), input_order.end(),
         [](const std::pair<int, float>& a, const std::pair<int, float>& b) {
@@ -1518,11 +1520,6 @@ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b,
 {
 }
 
-void func(float *r, float *g, float *b, float *r_blurred, float *g_blurred, float *b_blurred)
-{
-    //BlurEx(r, g, b, r_blurred, g_blurred,
-}
-
 typedef union ocl_channels_t
 {
     struct
@@ -1560,14 +1557,20 @@ float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block,
     float rgb1[3][kDCTBlockSize];
     {
         float rgb0_data[3*kDCTBlockSize];
-        ocl_channels rgb0_c = { rgb0_data, &rgb0_data[kDCTBlockSize], &rgb0_data[2 * kDCTBlockSize] };
+        ocl_channels rgb0_c;
+        rgb0_c.r = &rgb0_data[0];
+        rgb0_c.g = &rgb0_data[kDCTBlockSize];
+        rgb0_c.b = &rgb0_data[2 * kDCTBlockSize];
         for (int i = 0; i < 3*kDCTBlockSize; i++)
         {
             rgb0_data[i] = orig_image_block[i];
         }
 
         float image_block[3 * kDCTBlockSize];
-        ocl_channels rgb1_c = { image_block, &image_block[kDCTBlockSize], &image_block[2 * kDCTBlockSize] };
+        ocl_channels rgb1_c;
+        rgb1_c.r = &image_block[0];
+        rgb1_c.g = &image_block[kDCTBlockSize];
+        rgb1_c.b = &image_block[2 * kDCTBlockSize];
         BlockToImage(candidate_block, rgb1_c.r, rgb1_c.g, rgb1_c.b);
 
         CalcOpsinDynamicsImage(rgb0_c);

From 092557922bb8dcc865846f4c73ccc309d6d24e32 Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Mon, 15 May 2017 19:55:49 +0800
Subject: [PATCH 073/189] Implement part of BlurEx

---
 clguetzli/clguetzli.cl | 46 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index b33e98ce..f619b76b 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1495,11 +1495,57 @@ void BlockToImage(coeff_t *candidate_block, float *r, float *g, float *b)
     // �ο�clguetzli_comparator.cpp : BlockToImage
 }
 
+void Convolution(__global float* multipliers, __global float* inp, __global float* result,
+    size_t xsize, size_t ysize, int xstep, int len, int offset, float border_ratio)
+{
+	float weight_no_border = 0;
+
+	for (size_t j = 0; j <= 2 * offset; ++j) {
+		weight_no_border += multipliers[j];
+	}
+	for (size_t x = 0, ox = 0; x < xsize; x += xstep, ox++) {
+		int minx = x < offset ? 0 : x - offset;
+		int maxx = min(xsize, x + len - offset) - 1;
+		float weight = 0.0;
+		for (int j = minx; j <= maxx; ++j) {
+			weight += multipliers[j - x + offset];
+		}
+		// Interpolate linearly between the no-border scaling and border scaling.
+		weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
+		float scale = 1.0 / weight;
+		for (size_t y = 0; y < ysize; ++y) {
+			float sum = 0.0;
+			for (int j = minx; j <= maxx; ++j) {
+				sum += inp[y * xsize + j] * multipliers[j - x + offset];
+			}
+			result[ox * ysize + y] = (float)(sum * scale);
+		}
+	}
+}
+
 // ian todo
 // �����������output
 void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output)
 {
     // �ο�clBlurEx2��ʵ�֣�sigma = 1.1����ʱstep��diff�����ػ�Ϊ�̶�ֵ
+	const double sigma = 1.1;
+	double m = 2.25;  // Accuracy increases when m is increased.
+	const double scaler = -0.41322314049586772; // when sigma=1.1, scaler is -0.41322314049586772
+	const int diff = 2;  // when sigma=1.1, diff's value is 2.
+	const int expn_size = 5; // when sigma=1.1, expn_size is 5 (2 * diff + 1 taps)
+	float expn[5] = { exp(scaler * (-diff) * (-diff)),
+							  exp(scaler * (-diff + 1) * (-diff + 1)),
+							  exp(scaler * (-diff + 2) * (-diff + 2)),
+							  exp(scaler * (-diff + 3) * (-diff + 3)),
+							  exp(scaler * (-diff + 4) * (-diff + 4))};
+	const int xstep = 1; // when sigma=1.1, xstep is 1.
+	/*
+	Convolution(xsize, ysize, xstep, expn_size, diff, expn.data(), channel,
+              border_ratio,
+              tmp.data());
+	Convolution(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(),
+              border_ratio, output);
+			  */
 }
 
 

From b3455dd3a735614d8667cf66fc8a7b338a7aa8b3 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Tue, 16 May 2017 02:53:50 +0800
Subject: [PATCH 074/189] Merge remote-tracking branch 'origin/master'

---
 clguetzli/clguetzli.cl           | 517 +++++++++++++++++++++++++++++--
 clguetzli/clguetzli.cpp          |  52 ++++
 clguetzli/clguetzli.h            |   8 +
 clguetzli/clguetzli_comparator.h |   2 +-
 clguetzli/ocl.h                  |   1 +
 guetzli/processor.cc             |  59 +++-
 6 files changed, 606 insertions(+), 33 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index f619b76b..6e354874 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1439,20 +1439,41 @@ typedef struct __IntFloatPairList
 // return size
 int list_push_back(IntFloatPairList* list, int i, float f)
 {
-    return 0;
+	list->pData[list->size].idx = i;
+	list->pData[list->size].err = f;
+    return ++list->size;
 }
 
 // chrisk todo
 // remove idx and return size
 int list_erase(IntFloatPairList* list, int idx)
 {
-    return 0;
+	for (int i = idx; i < list->size - 1; i++)
+	{
+		list->pData[i].idx = list->pData[i + 1].idx;
+		list->pData[i].err = list->pData[i + 1].err;
+	}
+    return --list->size;
 }
 
 // chrisk todo
 int SortInputOrder(DCTScoreData* input_order, int size)
 {
-    return 0;
+	int i, j; // j: element being inserted, i: scan position in the sorted prefix
+	DCTScoreData tmp; // copy of the element currently being inserted
+	for (j = 1; j < size; j++) { // insertion sort, ascending by err
+		tmp.idx = input_order[j].idx;
+		tmp.err = input_order[j].err;
+		i = j - 1;
+		while (i >= 0 && input_order[i].err > tmp.err) { // strict '>' keeps equal errs in input order
+			input_order[i + 1].idx = input_order[i].idx; // shift larger entries one slot right
+			input_order[i + 1].err = input_order[i].err;
+			i--;
+		}
+		input_order[i + 1].idx = tmp.idx; // drop the saved element into the gap
+		input_order[i + 1].err = tmp.err;
+	}
+    return size;
 /*
     std::sort(input_order.begin(), input_order.end(),
         [](const std::pair<int, float>& a, const std::pair<int, float>& b) {
@@ -1460,10 +1481,412 @@ int SortInputOrder(DCTScoreData* input_order, int size)
 */
 }
 
+__constant static float csf[192] = {
+	0.0f,
+	1.71014f,
+	0.298711f,
+	0.233709f,
+	0.223126f,
+	0.207072f,
+	0.192775f,
+	0.161201f,
+	2.05807f,
+	0.222927f,
+	0.203406f,
+	0.188465f,
+	0.184668f,
+	0.169993f,
+	0.159142f,
+	0.130155f,
+	0.430518f,
+	0.204939f,
+	0.206655f,
+	0.192231f,
+	0.182941f,
+	0.169455f,
+	0.157599f,
+	0.127153f,
+	0.234757f,
+	0.191098f,
+	0.192698f,
+	0.17425f,
+	0.166503f,
+	0.142154f,
+	0.126182f,
+	0.104196f,
+	0.226117f,
+	0.185373f,
+	0.183825f,
+	0.166643f,
+	0.159414f,
+	0.12636f,
+	0.108696f,
+	0.0911974f,
+	0.207463f,
+	0.171517f,
+	0.170124f,
+	0.141582f,
+	0.126213f,
+	0.103627f,
+	0.0882436f,
+	0.0751848f,
+	0.196436f,
+	0.161947f,
+	0.159271f,
+	0.126938f,
+	0.109125f,
+	0.0878027f,
+	0.0749842f,
+	0.0633859f,
+	0.165232f,
+	0.132905f,
+	0.128679f,
+	0.105766f,
+	0.0906087f,
+	0.0751544f,
+	0.0641187f,
+	0.0529921f,
+	0.0f,
+	0.147235f,
+	0.11264f,
+	0.0757892f,
+	0.0493929f,
+	0.0280663f,
+	0.0075012f,
+	-0.000945567f,
+	0.149251f,
+	0.0964806f,
+	0.0786224f,
+	0.05206f,
+	0.0292758f,
+	0.00353094f,
+	-0.00277912f,
+	-0.00404481f,
+	0.115551f,
+	0.0793142f,
+	0.0623735f,
+	0.0405019f,
+	0.0152656f,
+	-0.00145742f,
+	-0.00370369f,
+	-0.00375106f,
+	0.0791547f,
+	0.0537506f,
+	0.0413634f,
+	0.0193486f,
+	0.000609066f,
+	-0.00510923f,
+	-0.0046452f,
+	-0.00385187f,
+	0.0544534f,
+	0.0334066f,
+	0.0153899f,
+	0.000539088f,
+	-0.00356085f,
+	-0.00535661f,
+	-0.00429145f,
+	-0.00343131f,
+	0.0356439f,
+	0.00865645f,
+	0.00165229f,
+	-0.00425931f,
+	-0.00507324f,
+	-0.00459083f,
+	-0.003703f,
+	-0.00310327f,
+	0.0121926f,
+	-0.0009259f,
+	-0.00330991f,
+	-0.00499378f,
+	-0.00437381f,
+	-0.00377427f,
+	-0.00311731f,
+	-0.00255125f,
+	-0.000320593f,
+	-0.00426043f,
+	-0.00416549f,
+	-0.00419364f,
+	-0.00365418f,
+	-0.00317499f,
+	-0.00255932f,
+	-0.00217917f,
+	0.0f,
+	0.143471f,
+	0.124336f,
+	0.0947465f,
+	0.0814066f,
+	0.0686776f,
+	0.0588122f,
+	0.0374415f,
+	0.146315f,
+	0.105334f,
+	0.0949415f,
+	0.0784241f,
+	0.0689064f,
+	0.0588304f,
+	0.0495961f,
+	0.0202342f,
+	0.123818f,
+	0.0952654f,
+	0.0860556f,
+	0.0724158f,
+	0.0628307f,
+	0.0529965f,
+	0.0353941f,
+	0.00815821f,
+	0.097054f,
+	0.080422f,
+	0.0731085f,
+	0.0636154f,
+	0.055606f,
+	0.0384127f,
+	0.0142879f,
+	0.00105195f,
+	0.0849312f,
+	0.071115f,
+	0.0631183f,
+	0.0552972f,
+	0.0369221f,
+	0.00798314f,
+	0.000716374f,
+	-0.00200948f,
+	0.0722298f,
+	0.0599559f,
+	0.054841f,
+	0.0387529f,
+	0.0107262f,
+	0.000355315f,
+	-0.00244803f,
+	-0.00335222f,
+	0.0635335f,
+	0.0514196f,
+	0.0406309f,
+	0.0125833f,
+	0.00151305f,
+	-0.00140269f,
+	-0.00362547f,
+	-0.00337649f,
+	0.0472024f,
+	0.0198725f,
+	0.0113437f,
+	0.00266305f,
+	-0.00137183f,
+	-0.00354158f,
+	-0.00341292f,
+	-0.00290074f
+};
+
+__constant static float bias[192] = {
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0f,
+	0.0
+};
+
 // chrisk todo
 // return the count of Non-zero item
-int MakeInputOrder(__global coeff_t *orig_block, DCTScoreData *input_order, int size)
+int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTScoreData *input_order, int size)
 {
+	/* ��һЩ���⣬��ע�͵�
+	for (int c = 0; c < 3; ++c) {
+		if (!(comp_mask & (1 << c))) continue;
+		for (int k = 1; k < size; ++k) {
+			int idx = c * size + k;
+			if (block[idx] != 0) {
+				float score = abs(orig_block[idx]) * csf[idx] + bias[idx];
+				list_push_back(input_order, idx, score);
+			}
+		}
+	}
+	*/
 /*
     static const double kWeight[3] = { 1.0, 0.22, 0.20 };
 #include "guetzli/order.inc"
@@ -1495,8 +1918,12 @@ void BlockToImage(coeff_t *candidate_block, float *r, float *g, float *b)
     // �ο�clguetzli_comparator.cpp : BlockToImage
 }
 
-void Convolution(__global float* multipliers, __global float* inp, __global float* result,
-    size_t xsize, size_t ysize, int xstep, int len, int offset, float border_ratio)
+void Convolution(size_t xsize, size_t ysize,
+                 int xstep, int len, int offset,
+                 float* multipliers,
+                 float* inp,
+                 float border_ratio,
+                 float* result)
 {
 	float weight_no_border = 0;
 
@@ -1539,13 +1966,15 @@ void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio,
 							  exp(scaler * (-diff + 3) * (-diff + 3)),
 							  exp(scaler * (-diff + 4) * (-diff + 4))};
 	const int xstep = 1; // when sigma=1.1, xstep is 1.
-	/*
-	Convolution(xsize, ysize, xstep, expn_size, diff, expn.data(), channel,
-              border_ratio,
-              tmp.data());
-	Convolution(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(),
+  const int ystep = xstep;
+
+  int dxsize = (xsize + xstep - 1) / xstep;
+  int dysize = (ysize + ystep - 1) / ystep;
+
+  float *tmp = 0; // TODO:need a tmp and
+	Convolution(xsize, ysize, xstep, expn_size, diff, expn, r, border_ratio, tmp);
+	Convolution(ysize, dxsize, ystep, expn_size, diff, expn, tmp,
               border_ratio, output);
-			  */
 }
 
 
@@ -1554,7 +1983,29 @@ void OpsinDynamicsImageBlock(float *r, float *g, float *b,
                             float *r_blurred, float *g_blurred, float *b_blurred,
                             int size)
 {
-
+  for (size_t i = 0; i < size; ++i) { // one iteration per sample; r/g/b are overwritten in place
+    double sensitivity[3]; // NOTE(review): double math in OpenCL C presumably needs fp64 support - confirm
+    {
+      // Calculate sensitivity[3] based on the smoothed image gamma derivative.
+      double pre_rgb[3] = { r_blurred[i], g_blurred[i], b_blurred[i] };
+      double pre_mixed[3];
+      OpsinAbsorbance(pre_rgb, pre_mixed);
+      sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0]; // NOTE(review): no guard against a zero divisor
+      sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1];
+      sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2];
+    }
+    double cur_rgb[3] = { r[i],  g[i],  b[i] }; // the unblurred sample
+    double cur_mixed[3];
+    OpsinAbsorbance(cur_rgb, cur_mixed);
+    cur_mixed[0] *= sensitivity[0]; // scale each channel by the sensitivity from the blurred sample
+    cur_mixed[1] *= sensitivity[1];
+    cur_mixed[2] *= sensitivity[2];
+    double x, y, z;
+    RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z);
+    r[i] = (float)(x); // results replace the input channels
+    g[i] = (float)(y);
+    b[i] = (float)(z);
+  }
 }
 
 // chrisk todo
@@ -1597,6 +2048,9 @@ void CalcOpsinDynamicsImage(ocl_channels rgb)
 }
 
 // strong todo
+// candidate_block [R....R][G....G][B....B]
+// orig_image_block [RR..RRGG..GGBB..BB]
+// mask_scale[RGB]
 float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block, __global float* mask_scale_block)
 {
     float rgb0[3][kDCTBlockSize];
@@ -1661,6 +2115,12 @@ float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block,
 }
 
 // strong todo
+// orig_block_list [R....R][G....G][B....B]
+// block_list [R....R][G....G][B....B]
+// orig_image [RR..RRGG..GGBB..BB]
+// mask_scale[RGB]
+// output_order_list [3 * kBlockSize]
+
 __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/,
                                          __global coeff_t *block_list/*in*/,
                                          __global float *orig_image/*in*/,
@@ -1668,20 +2128,21 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/
                                          __global CoeffData *output_order_list/*out*/)
 {
     int block_idx = get_global_id(0);
+#define kComputeBlockSize (kBlockSize * 3)
 
-    __global coeff_t *orig_block = orig_block_list + block_idx * kBlockSize;
-    __global coeff_t *block      = block_list + block_idx * kBlockSize;
-    __global float* orig_image_block = orig_image + block_idx * kBlockSize;
+    __global coeff_t *orig_block     = orig_block_list + block_idx * kComputeBlockSize;
+    __global coeff_t *block          = block_list + block_idx * kComputeBlockSize;
+    __global float* orig_image_block = orig_image + block_idx * kComputeBlockSize;
 
-    DCTScoreData input_order_data[kBlockSize];
-    CoeffData    output_order_data[kBlockSize];
+    DCTScoreData input_order_data[kComputeBlockSize];
+    CoeffData    output_order_data[kComputeBlockSize];
 
-    MakeInputOrder(orig_block, input_order_data, kBlockSize);
-    IntFloatPairList input_order = { kBlockSize, input_order_data };
-    IntFloatPairList output_order = { kBlockSize, output_order_data };
+    int count = MakeInputOrder(block, orig_block, input_order_data, kComputeBlockSize);
+    IntFloatPairList input_order = { count, input_order_data };
+    IntFloatPairList output_order = { 0, output_order_data };
 
-    coeff_t processed_block[kBlockSize];
-    for (int i = 0; i < kBlockSize; i++) {
+    coeff_t processed_block[kComputeBlockSize];
+    for (int i = 0; i < kComputeBlockSize; i++) {
         processed_block[i] = block[i];
     }
 
@@ -1691,8 +2152,8 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/
         int best_i = 0;
         for (int i = 0; i < min(3, input_order.size); i++)
         {
-            coeff_t candidate_block[kBlockSize];
-            for (int i = 0; i < kBlockSize; i++) {
+            coeff_t candidate_block[kComputeBlockSize];
+            for (int i = 0; i < kComputeBlockSize; i++) {
                 candidate_block[i] = processed_block[i];
             }
 
@@ -1721,11 +2182,11 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/
         output_order.pData[i].err = min_err;
     }
 
-    __global CoeffData *output_block = output_order_list + block_idx * kBlockSize;
+    __global CoeffData *output_block = output_order_list + block_idx * kComputeBlockSize;
 
-    for (int i = 0; i < kBlockSize; i++)
+    for (int i = 0; i < kComputeBlockSize; i++)
     {
-        if (i > output_order.size)
+        if (i >= output_order.size)
         {
             output_block[i].idx = 0;
             output_block[i].err = 0;
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 32d0d77b..b8c8ad2a 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -63,6 +63,7 @@ ocl_args_d_t& getOcl(void)
 	ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMap", &err);
 	ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMap", &err);
 	ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreq", &err);
+    ocl.kernel[KERNEL_COMPUTEBLOCKZERONGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrder", &err);
 
 	return ocl;
 }
@@ -1192,3 +1193,92 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 
 	clReleaseMemObject(mem_result);
 }
+
+// Host-side driver for the "clComputeBlockZeroingOrder" kernel.
+//
+// Uploads the per-block inputs, launches one work-item per block and copies
+// the resulting order list back into output_order_list.
+//
+//   orig_block_list   in:  3 * kDCTBlockSize coefficients per block
+//   block_list        in:  3 * kDCTBlockSize coefficients per block
+//   orig_iamge        in:  3 * kDCTBlockSize floats per block
+//   mask_scale        in:  3 floats per block
+//   output_order_list out: 3 * kDCTBlockSize CoeffData entries per block
+//   size              number of blocks (one work-item each)
+//
+// OpenCL errors are logged; output_order_list is written only when the
+// result buffer was mapped successfully.
+void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_list, guetzli::coeff_t *block_list,
+                              float *orig_iamge, float* mask_scale, CoeffData *output_order_list,
+                              int size)
+{
+    using namespace guetzli;
+
+    if (size <= 0) return; // nothing to compute; a zero-sized NDRange is invalid
+
+    const size_t item_count = (size_t)3 * kDCTBlockSize * size;
+    const size_t coeff_bytes = sizeof(coeff_t) * item_count;
+    const size_t image_bytes = sizeof(float) * item_count;
+    const size_t mask_bytes = sizeof(float) * 3 * size;
+    const size_t output_bytes = sizeof(CoeffData) * item_count;
+
+    cl_int err = 0;
+    ocl_args_d_t &ocl = getOcl();
+
+    cl_mem mem_orig_block_list = ocl.allocMem(coeff_bytes);
+    cl_mem mem_block_list = ocl.allocMem(coeff_bytes);
+    cl_mem mem_orig_image = ocl.allocMem(image_bytes);
+    cl_mem mem_mask_scale = ocl.allocMem(mask_bytes);
+    cl_mem mem_output_order_list = ocl.allocMem(output_bytes);
+
+    // The kernel reads all four input buffers, so fill them before the launch.
+    // Blocking writes: the host arrays may be reused as soon as we return.
+    err = clEnqueueWriteBuffer(ocl.commandQueue, mem_orig_block_list, CL_TRUE, 0, coeff_bytes, orig_block_list, 0, NULL, NULL);
+    if (CL_SUCCESS == err) err = clEnqueueWriteBuffer(ocl.commandQueue, mem_block_list, CL_TRUE, 0, coeff_bytes, block_list, 0, NULL, NULL);
+    if (CL_SUCCESS == err) err = clEnqueueWriteBuffer(ocl.commandQueue, mem_orig_image, CL_TRUE, 0, image_bytes, orig_iamge, 0, NULL, NULL);
+    if (CL_SUCCESS == err) err = clEnqueueWriteBuffer(ocl.commandQueue, mem_mask_scale, CL_TRUE, 0, mask_bytes, mask_scale, 0, NULL, NULL);
+    if (CL_SUCCESS != err)
+    {
+        LogError("Error: clComputeBlockZeroingOrder() clEnqueueWriteBuffer returned %s.\n", TranslateOpenCLError(err));
+    }
+
+    cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZERONGORDER];
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_block_list);
+    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_block_list);
+    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_orig_image);
+    clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_mask_scale);
+    clSetKernelArg(kernel, 4, sizeof(cl_mem), &mem_output_order_list);
+
+    size_t globalWorkSize[1] = { (size_t)size };
+    err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    if (CL_SUCCESS != err)
+    {
+        LogError("Error: clComputeBlockZeroingOrder() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+    }
+    err = clFinish(ocl.commandQueue);
+    if (CL_SUCCESS != err)
+    {
+        LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err));
+    }
+
+    // Blocking map: the pointer is valid on return, no extra clFinish needed.
+    CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_list, CL_TRUE, CL_MAP_READ, 0, output_bytes, 0, NULL, NULL, &err);
+    if (CL_SUCCESS != err || NULL == result)
+    {
+        LogError("Error: clComputeBlockZeroingOrder() clEnqueueMapBuffer returned %s.\n", TranslateOpenCLError(err));
+    }
+    else
+    {
+        memcpy(output_order_list, result, output_bytes);
+        // Arguments 4-6 are the event wait list (count, list) and the output
+        // event; the count must be 0 when the list is NULL.
+        clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_list, result, 0, NULL, NULL);
+        clFinish(ocl.commandQueue);
+    }
+
+    clReleaseMemObject(mem_orig_block_list);
+    clReleaseMemObject(mem_block_list);
+    clReleaseMemObject(mem_orig_image);
+    clReleaseMemObject(mem_mask_scale);
+    clReleaseMemObject(mem_output_order_list);
+}
\ No newline at end of file
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index aa595ab5..178dce27 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -1,10 +1,16 @@
 #pragma once
 #include "CL\cl.h"
+#include "guetzli\jpeg_data.h"
 #include "ocl.h"
 
 extern bool g_useOpenCL;
 extern bool g_checkOpenCL;
 
+struct CoeffData { // one entry of the zeroing order returned by clComputeBlockZeroingOrder
+    int idx; // coefficient index within a block
+    float block_err; // NOTE(review): copied raw from the device buffer, where this field is named err - layouts must match
+};
+
 void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b);
 
 void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
@@ -13,6 +19,8 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
     size_t step,
     float* result);
 
+void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_list, guetzli::coeff_t *block_list, float *orig_iamge, float* mask_scale, CoeffData *output_order_list, int size);
+
 void clMask(const float* r, const float* g, const float* b,
     const float* r2, const float* g2, const float* b2,
     size_t xsize, size_t ysize,
diff --git a/clguetzli/clguetzli_comparator.h b/clguetzli/clguetzli_comparator.h
index 353eff59..97f23fb9 100644
--- a/clguetzli/clguetzli_comparator.h
+++ b/clguetzli/clguetzli_comparator.h
@@ -18,7 +18,7 @@ namespace guetzli {
 		double CompareBlockEx(coeff_t* candidate_block);
     private:
         int getCurrentBlockIdx(void);
-	protected:
+	public:
 		std::vector<float> imgOpsinDynamicsBlockList;   // [RR..RRGG..GGBB..BB]:blockCount
         std::vector<float> imgMaskXyzScaleBlockList;    // [RGBRGB..RGBRGB]:blockCount
 	};
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index b9ada586..ae5ceeeb 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -63,6 +63,7 @@ enum KernelName {
 	KERNEL_EDGEDETECTOR,
 	KERNEL_BLOCKDIFFMAP,
 	KERNEL_EDGEDETECTORLOWFREQ,
+    KERNEL_COMPUTEBLOCKZERONGORDER,
 	KERNEL_COUNT,
 };
 
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index c4d43bef..eaf9f75b 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -23,6 +23,7 @@
 
 #include "guetzli/butteraugli_comparator.h"
 #include "clguetzli\clguetzli_comparator.h"
+#include "clguetzli\clguetzli.h"
 #include "guetzli/comparator.h"
 #include "guetzli/debug_print.h"
 #include "guetzli/fast_log.h"
@@ -38,11 +39,12 @@ namespace guetzli {
 namespace {
 
 static const size_t kBlockSize = 3 * kDCTBlockSize;
-
+/*
 struct CoeffData {
   int idx;
   float block_err;
 };
+*/
 struct QuantData {
   int q[3][kDCTBlockSize];
   size_t jpg_size;
@@ -601,10 +603,59 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
         }
     }
 
+    std::vector<CoeffData> output_order(num_blocks * kBlockSize);
+    ButteraugliComparatorEx * comp = (ButteraugliComparatorEx*)comparator_;
+
+    if (g_useOpenCL)
+    {
+        clComputeBlockZeroingOrder(orig_block_batch.data(),
+                                    block_batch.data(),
+                                    comp->imgOpsinDynamicsBlockList.data(),
+                                    comp->imgMaskXyzScaleBlockList.data(),
+                                    output_order.data(),
+                                    num_blocks);
+    }
+    else
+    {
+        for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
+            for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
+                coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize];
+                coeff_t *block = &block_batch[block_ix * kBlockSize];
+
+                std::vector<CoeffData> block_order;
+
+                ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, &block_order);
+
+                CoeffData * p = &output_order[block_ix * kBlockSize];
+                for (int i = 0; i < block_order.size(); i++)
+                {
+                    p[i].idx = block_order[i].idx;
+                    p[i].block_err = block_order[i].block_err;
+                }
+            }
+        }
+    }
+
     std::vector<int> candidate_coeff_offsets(num_blocks + 1);
     std::vector<uint8_t> candidate_coeffs;
     std::vector<float> candidate_coeff_errors;
 
+    for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
+        for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
+            CoeffData * p = &output_order[block_ix * kBlockSize];
+
+            candidate_coeff_offsets[block_ix] = candidate_coeffs.size();
+            for (int i = 0; i < kBlockSize; i++)
+            {
+                if (p[i].block_err > 0 && p[i].block_err <= comparator_->BlockErrorLimit())
+                {
+                    candidate_coeffs.push_back(p[i].idx);
+                    candidate_coeff_errors.push_back(p[i].block_err);
+                }
+            }
+        }
+    }
+/*
     // step 2 �Ա�ÿ��block���
     for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
         for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
@@ -623,7 +674,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
             }
         }
     }
-
+*/
     //
     comparator_->FinishBlockComparisons(); // TOBEREMOVE:�������
     candidate_coeff_offsets[num_blocks] = candidate_coeffs.size();
@@ -666,9 +717,9 @@ void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const
     }
     std::sort(input_order.begin(), input_order.end(), [](const std::pair<int, float>& a, const std::pair<int, float>& b) { return a.second < b.second; });
 
-    if (input_order.size() > 10)
+    if (input_order.size() > 64)
     {
-        int i = 0;
+        g_compareBlock++;
     }
     coeff_t processed_block[kBlockSize];
     memcpy(processed_block, block, sizeof(processed_block));

From 5e53802a76a1dcd6d4120aa764184c4bdac9f1c3 Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Tue, 16 May 2017 09:44:34 +0800
Subject: [PATCH 075/189] Fix BlurEx

---
 clguetzli/clguetzli.cl | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 6e354874..f64f9331 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1966,12 +1966,11 @@ void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio,
 							  exp(scaler * (-diff + 3) * (-diff + 3)),
 							  exp(scaler * (-diff + 4) * (-diff + 4))};
 	const int xstep = 1; // when sigma=1.1, xstep is 1.
-  const int ystep = xstep;
+	const int ystep = xstep;
 
-  int dxsize = (xsize + xstep - 1) / xstep;
-  int dysize = (ysize + ystep - 1) / ystep;
+	int dxsize = (xsize + xstep - 1) / xstep;
 
-  float *tmp = 0; // TODO:need a tmp and
+	float tmp[8*8] = { 0 };
 	Convolution(xsize, ysize, xstep, expn_size, diff, expn, r, border_ratio, tmp);
 	Convolution(ysize, dxsize, ystep, expn_size, diff, expn, tmp,
               border_ratio, output);

From e69365c400b8acac05f1fb3382419d48b57a81d0 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Tue, 16 May 2017 09:48:50 +0800
Subject: [PATCH 076/189] fix data type of coeff_t

---
 clguetzli/clguetzli.cl | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index f64f9331..8f5601a5 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1427,7 +1427,7 @@ typedef struct __IntFloatPair
     float err;
 }IntFloatPair, DCTScoreData, CoeffData;
 
-typedef int16 coeff_t;
+typedef short coeff_t;
 
 typedef struct __IntFloatPairList
 {
@@ -1875,7 +1875,7 @@ __constant static float bias[192] = {
 // return the count of Non-zero item
 int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTScoreData *input_order, int size)
 {
-	/* ��һЩ���⣬��ע�͵�
+	int comp_mask = 7;
 	for (int c = 0; c < 3; ++c) {
 		if (!(comp_mask & (1 << c))) continue;
 		for (int k = 1; k < size; ++k) {
@@ -1886,7 +1886,6 @@ int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTSco
 			}
 		}
 	}
-	*/
 /*
     static const double kWeight[3] = { 1.0, 0.22, 0.20 };
 #include "guetzli/order.inc"

From 8d82c8e640261e5d90e7f5db63153b9116bdca9b Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Tue, 16 May 2017 10:03:02 +0800
Subject: [PATCH 077/189] modify MakeInputOrder

---
 clguetzli/clguetzli.cl | 40 ++++++----------------------------------
 1 file changed, 6 insertions(+), 34 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 8f5601a5..6271b6bd 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1474,11 +1474,6 @@ int SortInputOrder(DCTScoreData* input_order, int size)
 		input_order[i + 1].err = tmp.err;
 	}
     return size;
-/*
-    std::sort(input_order.begin(), input_order.end(),
-        [](const std::pair<int, float>& a, const std::pair<int, float>& b) {
-        return a.second < b.second; });
-*/
 }
 
 __constant static float csf[192] = {
@@ -1873,41 +1868,28 @@ __constant static float bias[192] = {
 
 // chrisk todo
 // return the count of Non-zero item
-int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTScoreData *input_order, int size)
+// Collects every coefficient with k >= 1 of the three components that is
+// non-zero in block, scores it as abs(orig_block[idx]) * csf[idx] + bias[idx]
+// and stores the (idx, score) pairs in input_order[0..count).
+// The filled prefix is handed to SortInputOrder, whose result is returned.
+// block_size is the coefficient count of one component; input_order must
+// have room for 3 * (block_size - 1) entries.
+int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTScoreData *input_order, int block_size)
 {
-	int comp_mask = 7;
+	int size = 0;
 	for (int c = 0; c < 3; ++c) {
-		if (!(comp_mask & (1 << c))) continue;
-		for (int k = 1; k < size; ++k) {
-			int idx = c * size + k;
+		for (int k = 1; k < block_size; ++k) {
+			int idx = c * block_size + k;
 			if (block[idx] != 0) {
 				float score = abs(orig_block[idx]) * csf[idx] + bias[idx];
-				list_push_back(input_order, idx, score);
+				// input_order is a plain DCTScoreData array, not an
+				// IntFloatPairList, so it cannot go through list_push_back.
+				input_order[size].idx = idx;
+				input_order[size].err = score;
+				++size;
 			}
 		}
 	}
-/*
-    static const double kWeight[3] = { 1.0, 0.22, 0.20 };
-#include "guetzli/order.inc"
-    std::vector<std::pair<int, float> > input_order;
-    for (int c = 0; c < 3; ++c) {
-        if (!(comp_mask & (1 << c))) continue;
-        for (int k = 1; k < kDCTBlockSize; ++k) {
-            int idx = c * kDCTBlockSize + k;
-            if (block[idx] != 0) {
-                float score;
-                if (params_.new_zeroing_model) {
-                    score = std::abs(orig_block[idx]) * csf[idx] + bias[idx];
-                }
-                else {
-                    score = static_cast<float>((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) *
-                        kWeight[c] / oldCsf[k]);
-                }
-                input_order.push_back(std::make_pair(idx, score));
-            }
-        }
-    }
-*/
     return SortInputOrder(input_order, size);
 }
 
@@ -2135,7 +2107,7 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/
     DCTScoreData input_order_data[kComputeBlockSize];
     CoeffData    output_order_data[kComputeBlockSize];
 
-    int count = MakeInputOrder(block, orig_block, input_order_data, kComputeBlockSize);
+    int count = MakeInputOrder(block, orig_block, input_order_data, kBlockSize);
     IntFloatPairList input_order = { count, input_order_data };
     IntFloatPairList output_order = { 0, output_order_data };
 

From fecac92d6e1db4d2cf7116bd1f52951263d5ead8 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Tue, 16 May 2017 10:42:19 +0800
Subject: [PATCH 078/189] Add BlockToImage

---
 clguetzli/clguetzli.cl | 416 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 414 insertions(+), 2 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 6271b6bd..3a242fdb 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1883,10 +1883,422 @@ int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTSco
     return SortInputOrder(input_order, size);
 }
 
+__constant static int kIDCTMatrix[kDCTBlockSize] = { // read by Compute1dIDCT as kIDCTMatrix[8 * x + u]
+	8192,  11363,  10703,   9633,   8192,   6437,   4433,   2260,
+	8192,   9633,   4433,  -2259,  -8192, -11362, -10704,  -6436,
+	8192,   6437,  -4433, -11362,  -8192,   2261,  10704,   9633,
+	8192,   2260, -10703,  -6436,   8192,   9633,  -4433, -11363,
+	8192,  -2260, -10703,   6436,   8192,  -9633,  -4433,  11363,
+	8192,  -6437,  -4433,  11362,  -8192,  -2261,  10704,  -9633,
+	8192,  -9633,   4433,   2259,  -8192,  11362, -10704,   6436,
+	8192, -11363,  10703,  -9633,   8192,  -6437,   4433,  -2260,
+};
+
+// Computes out[x] = sum{kIDCTMatrix[8*x+u]*in[u*stride]; for u in [0..7]}
+void Compute1dIDCT(const coeff_t* in, const int stride, int out[8]) { // only table rows 0..3 are read; out[4..7] reuse those products with a sign
+	int tmp0, tmp1, tmp2, tmp3, tmp4; // tmp0: current input sample, tmp1..tmp4: its products with rows 0..3
+
+	tmp1 = kIDCTMatrix[0] * in[0]; // u = 0: the same term goes to every output
+	out[0] = out[1] = out[2] = out[3] = out[4] = out[5] = out[6] = out[7] = tmp1;
+
+	tmp0 = in[stride]; // u = 1: out[7 - x] gets the negated product of out[x]
+	tmp1 = kIDCTMatrix[1] * tmp0;
+	tmp2 = kIDCTMatrix[9] * tmp0;
+	tmp3 = kIDCTMatrix[17] * tmp0;
+	tmp4 = kIDCTMatrix[25] * tmp0;
+	out[0] += tmp1;
+	out[1] += tmp2;
+	out[2] += tmp3;
+	out[3] += tmp4;
+	out[4] -= tmp4;
+	out[5] -= tmp3;
+	out[6] -= tmp2;
+	out[7] -= tmp1;
+
+	tmp0 = in[2 * stride]; // u = 2: two distinct products cover all eight outputs
+	tmp1 = kIDCTMatrix[2] * tmp0;
+	tmp2 = kIDCTMatrix[10] * tmp0;
+	out[0] += tmp1;
+	out[1] += tmp2;
+	out[2] -= tmp2;
+	out[3] -= tmp1;
+	out[4] -= tmp1;
+	out[5] -= tmp2;
+	out[6] += tmp2;
+	out[7] += tmp1;
+
+	tmp0 = in[3 * stride]; // u = 3: same sign pattern as u = 1
+	tmp1 = kIDCTMatrix[3] * tmp0;
+	tmp2 = kIDCTMatrix[11] * tmp0;
+	tmp3 = kIDCTMatrix[19] * tmp0;
+	tmp4 = kIDCTMatrix[27] * tmp0;
+	out[0] += tmp1;
+	out[1] += tmp2;
+	out[2] += tmp3;
+	out[3] += tmp4;
+	out[4] -= tmp4;
+	out[5] -= tmp3;
+	out[6] -= tmp2;
+	out[7] -= tmp1;
+
+	tmp0 = in[4 * stride]; // u = 4: a single product, only the sign varies per output
+	tmp1 = kIDCTMatrix[4] * tmp0;
+	out[0] += tmp1;
+	out[1] -= tmp1;
+	out[2] -= tmp1;
+	out[3] += tmp1;
+	out[4] += tmp1;
+	out[5] -= tmp1;
+	out[6] -= tmp1;
+	out[7] += tmp1;
+
+	tmp0 = in[5 * stride]; // u = 5: same sign pattern as u = 1
+	tmp1 = kIDCTMatrix[5] * tmp0;
+	tmp2 = kIDCTMatrix[13] * tmp0;
+	tmp3 = kIDCTMatrix[21] * tmp0;
+	tmp4 = kIDCTMatrix[29] * tmp0;
+	out[0] += tmp1;
+	out[1] += tmp2;
+	out[2] += tmp3;
+	out[3] += tmp4;
+	out[4] -= tmp4;
+	out[5] -= tmp3;
+	out[6] -= tmp2;
+	out[7] -= tmp1;
+
+	tmp0 = in[6 * stride]; // u = 6: same sign pattern as u = 2
+	tmp1 = kIDCTMatrix[6] * tmp0;
+	tmp2 = kIDCTMatrix[14] * tmp0;
+	out[0] += tmp1;
+	out[1] += tmp2;
+	out[2] -= tmp2;
+	out[3] -= tmp1;
+	out[4] -= tmp1;
+	out[5] -= tmp2;
+	out[6] += tmp2;
+	out[7] += tmp1;
+
+	tmp0 = in[7 * stride]; // u = 7: same sign pattern as u = 1
+	tmp1 = kIDCTMatrix[7] * tmp0;
+	tmp2 = kIDCTMatrix[15] * tmp0;
+	tmp3 = kIDCTMatrix[23] * tmp0;
+	tmp4 = kIDCTMatrix[31] * tmp0;
+	out[0] += tmp1;
+	out[1] += tmp2;
+	out[2] += tmp3;
+	out[3] += tmp4;
+	out[4] -= tmp4;
+	out[5] -= tmp3;
+	out[6] -= tmp2;
+	out[7] -= tmp1;
+}
+
+void CoeffToIDCT(coeff_t *block, uchar * out) // 8x8 coefficients in, 64 bytes clamped to 0..255 out
+{
+	coeff_t colidcts[kDCTBlockSize]; // result of the column pass, stored row-major
+	const int kColScale = 11;
+	const int kColRound = 1 << (kColScale - 1); // adds half of the divisor so the shift rounds
+	for (int x = 0; x < 8; ++x) // pass 1: 1-D IDCT down each column (stride 8)
+	{
+		int colbuf[8] = { 0 };
+		Compute1dIDCT(&block[x], 8, colbuf);
+		for (int y = 0; y < 8; ++y)
+		{
+			colidcts[8 * y + x] = (colbuf[y] + kColRound) >> kColScale;
+		}
+	}
+	const int kRowScale = 18;
+	const int kRowRound = 257 << (kRowScale - 1);  // includes offset by 128
+	for (int y = 0; y < 8; ++y) // pass 2: 1-D IDCT along each row (stride 1)
+	{
+		const int rowidx = 8 * y;
+		int rowbuf[8] = { 0 };
+		Compute1dIDCT(&colidcts[rowidx], 1, rowbuf);
+		for (int x = 0; x < 8; ++x) {
+			out[rowidx + x] = max(0, min(255, (rowbuf[x] + kRowRound) >> kRowScale)); // clamp to the byte range
+		}
+	}
+}
+
+void IDCTToImage(uchar *idct, ushort *pixels_) // copies one 8x8 block of bytes into pixels_, scaled by 16
+{
+	const int block_x = 0; // the block position is fixed at the origin here
+	const int block_y = 0;
+	const int width_ = 8; // the target image is exactly one 8x8 block
+	const int height_ = 8;
+
+	for (int iy = 0; iy < 8; ++iy)
+	{
+		for (int ix = 0; ix < 8; ++ix)
+		{
+			int x = 8 * block_x + ix;
+			int y = 8 * block_y + iy;
+			if (x >= width_ || y >= height_) continue; // never true with the constants above
+			int p = y * width_ + x;
+			pixels_[p] = idct[8 * iy + ix] << 4; // store the sample shifted left by 4 bits
+		}
+	}
+}
+
+void ImageToYUV(ushort *pixels_, uchar *out) // narrows 64 samples to bytes, writing every third byte of out
+{
+	const int stride = 3; // distance in bytes between consecutive output samples
+
+	for (int y = 0; y < 8; ++y)
+	{
+		for (int x = 0; x < 8; ++x)
+		{
+			int px = y * 8 + x;
+			*out = (uchar) ((pixels_[px] + 8 - (x & 1)) >> 4); // shift back by 4 bits; the added bias is 8 for even x and 7 for odd x
+			out += stride;
+		}
+	}
+}
+
+__constant static int kCrToRedTable[256] = {
+	-179, -178, -177, -175, -174, -172, -171, -170, -168, -167, -165, -164,
+	-163, -161, -160, -158, -157, -156, -154, -153, -151, -150, -149, -147,
+	-146, -144, -143, -142, -140, -139, -137, -136, -135, -133, -132, -130,
+	-129, -128, -126, -125, -123, -122, -121, -119, -118, -116, -115, -114,
+	-112, -111, -109, -108, -107, -105, -104, -102, -101, -100,  -98,  -97,
+	-95,  -94,  -93,  -91,  -90,  -88,  -87,  -86,  -84,  -83,  -81,  -80,
+	-79,  -77,  -76,  -74,  -73,  -72,  -70,  -69,  -67,  -66,  -64,  -63,
+	-62,  -60,  -59,  -57,  -56,  -55,  -53,  -52,  -50,  -49,  -48,  -46,
+	-45,  -43,  -42,  -41,  -39,  -38,  -36,  -35,  -34,  -32,  -31,  -29,
+	-28,  -27,  -25,  -24,  -22,  -21,  -20,  -18,  -17,  -15,  -14,  -13,
+	-11,  -10,   -8,   -7,   -6,   -4,   -3,   -1,    0,    1,    3,    4,
+	6,    7,    8,   10,   11,   13,   14,   15,   17,   18,   20,   21,
+	22,   24,   25,   27,   28,   29,   31,   32,   34,   35,   36,   38,
+	39,   41,   42,   43,   45,   46,   48,   49,   50,   52,   53,   55,
+	56,   57,   59,   60,   62,   63,   64,   66,   67,   69,   70,   72,
+	73,   74,   76,   77,   79,   80,   81,   83,   84,   86,   87,   88,
+	90,   91,   93,   94,   95,   97,   98,  100,  101,  102,  104,  105,
+	107,  108,  109,  111,  112,  114,  115,  116,  118,  119,  121,  122,
+	123,  125,  126,  128,  129,  130,  132,  133,  135,  136,  137,  139,
+	140,  142,  143,  144,  146,  147,  149,  150,  151,  153,  154,  156,
+	157,  158,  160,  161,  163,  164,  165,  167,  168,  170,  171,  172,
+	174,  175,  177,  178
+};
+
+__constant static int kCbToBlueTable[256] = {
+	-227, -225, -223, -222, -220, -218, -216, -214, -213, -211, -209, -207,
+	-206, -204, -202, -200, -198, -197, -195, -193, -191, -190, -188, -186,
+	-184, -183, -181, -179, -177, -175, -174, -172, -170, -168, -167, -165,
+	-163, -161, -159, -158, -156, -154, -152, -151, -149, -147, -145, -144,
+	-142, -140, -138, -136, -135, -133, -131, -129, -128, -126, -124, -122,
+	-120, -119, -117, -115, -113, -112, -110, -108, -106, -105, -103, -101,
+	-99,  -97,  -96,  -94,  -92,  -90,  -89,  -87,  -85,  -83,  -82,  -80,
+	-78,  -76,  -74,  -73,  -71,  -69,  -67,  -66,  -64,  -62,  -60,  -58,
+	-57,  -55,  -53,  -51,  -50,  -48,  -46,  -44,  -43,  -41,  -39,  -37,
+	-35,  -34,  -32,  -30,  -28,  -27,  -25,  -23,  -21,  -19,  -18,  -16,
+	-14,  -12,  -11,   -9,   -7,   -5,   -4,   -2,    0,    2,    4,    5,
+	7,    9,   11,   12,   14,   16,   18,   19,   21,   23,   25,   27,
+	28,   30,   32,   34,   35,   37,   39,   41,   43,   44,   46,   48,
+	50,   51,   53,   55,   57,   58,   60,   62,   64,   66,   67,   69,
+	71,   73,   74,   76,   78,   80,   82,   83,   85,   87,   89,   90,
+	92,   94,   96,   97,   99,  101,  103,  105,  106,  108,  110,  112,
+	113,  115,  117,  119,  120,  122,  124,  126,  128,  129,  131,  133,
+	135,  136,  138,  140,  142,  144,  145,  147,  149,  151,  152,  154,
+	156,  158,  159,  161,  163,  165,  167,  168,  170,  172,  174,  175,
+	177,  179,  181,  183,  184,  186,  188,  190,  191,  193,  195,  197,
+	198,  200,  202,  204,  206,  207,  209,  211,  213,  214,  216,  218,
+	220,  222,  223,  225,
+};
+
+__constant static int kCrToGreenTable[256] = {
+	5990656,  5943854,  5897052,  5850250,  5803448,  5756646,  5709844,  5663042,
+	5616240,  5569438,  5522636,  5475834,  5429032,  5382230,  5335428,  5288626,
+	5241824,  5195022,  5148220,  5101418,  5054616,  5007814,  4961012,  4914210,
+	4867408,  4820606,  4773804,  4727002,  4680200,  4633398,  4586596,  4539794,
+	4492992,  4446190,  4399388,  4352586,  4305784,  4258982,  4212180,  4165378,
+	4118576,  4071774,  4024972,  3978170,  3931368,  3884566,  3837764,  3790962,
+	3744160,  3697358,  3650556,  3603754,  3556952,  3510150,  3463348,  3416546,
+	3369744,  3322942,  3276140,  3229338,  3182536,  3135734,  3088932,  3042130,
+	2995328,  2948526,  2901724,  2854922,  2808120,  2761318,  2714516,  2667714,
+	2620912,  2574110,  2527308,  2480506,  2433704,  2386902,  2340100,  2293298,
+	2246496,  2199694,  2152892,  2106090,  2059288,  2012486,  1965684,  1918882,
+	1872080,  1825278,  1778476,  1731674,  1684872,  1638070,  1591268,  1544466,
+	1497664,  1450862,  1404060,  1357258,  1310456,  1263654,  1216852,  1170050,
+	1123248,  1076446,  1029644,   982842,   936040,   889238,   842436,   795634,
+	748832,   702030,   655228,   608426,   561624,   514822,   468020,   421218,
+	374416,   327614,   280812,   234010,   187208,   140406,    93604,    46802,
+	0,   -46802,   -93604,  -140406,  -187208,  -234010,  -280812,  -327614,
+	-374416,  -421218,  -468020,  -514822,  -561624,  -608426,  -655228,  -702030,
+	-748832,  -795634,  -842436,  -889238,  -936040,  -982842, -1029644, -1076446,
+	-1123248, -1170050, -1216852, -1263654, -1310456, -1357258, -1404060, -1450862,
+	-1497664, -1544466, -1591268, -1638070, -1684872, -1731674, -1778476, -1825278,
+	-1872080, -1918882, -1965684, -2012486, -2059288, -2106090, -2152892, -2199694,
+	-2246496, -2293298, -2340100, -2386902, -2433704, -2480506, -2527308, -2574110,
+	-2620912, -2667714, -2714516, -2761318, -2808120, -2854922, -2901724, -2948526,
+	-2995328, -3042130, -3088932, -3135734, -3182536, -3229338, -3276140, -3322942,
+	-3369744, -3416546, -3463348, -3510150, -3556952, -3603754, -3650556, -3697358,
+	-3744160, -3790962, -3837764, -3884566, -3931368, -3978170, -4024972, -4071774,
+	-4118576, -4165378, -4212180, -4258982, -4305784, -4352586, -4399388, -4446190,
+	-4492992, -4539794, -4586596, -4633398, -4680200, -4727002, -4773804, -4820606,
+	-4867408, -4914210, -4961012, -5007814, -5054616, -5101418, -5148220, -5195022,
+	-5241824, -5288626, -5335428, -5382230, -5429032, -5475834, -5522636, -5569438,
+	-5616240, -5663042, -5709844, -5756646, -5803448, -5850250, -5897052, -5943854,
+};
+
+__constant static int kCbToGreenTable[256] = {
+	2919680,  2897126,  2874572,  2852018,  2829464,  2806910,  2784356,  2761802,
+	2739248,  2716694,  2694140,  2671586,  2649032,  2626478,  2603924,  2581370,
+	2558816,  2536262,  2513708,  2491154,  2468600,  2446046,  2423492,  2400938,
+	2378384,  2355830,  2333276,  2310722,  2288168,  2265614,  2243060,  2220506,
+	2197952,  2175398,  2152844,  2130290,  2107736,  2085182,  2062628,  2040074,
+	2017520,  1994966,  1972412,  1949858,  1927304,  1904750,  1882196,  1859642,
+	1837088,  1814534,  1791980,  1769426,  1746872,  1724318,  1701764,  1679210,
+	1656656,  1634102,  1611548,  1588994,  1566440,  1543886,  1521332,  1498778,
+	1476224,  1453670,  1431116,  1408562,  1386008,  1363454,  1340900,  1318346,
+	1295792,  1273238,  1250684,  1228130,  1205576,  1183022,  1160468,  1137914,
+	1115360,  1092806,  1070252,  1047698,  1025144,  1002590,   980036,   957482,
+	934928,   912374,   889820,   867266,   844712,   822158,   799604,   777050,
+	754496,   731942,   709388,   686834,   664280,   641726,   619172,   596618,
+	574064,   551510,   528956,   506402,   483848,   461294,   438740,   416186,
+	393632,   371078,   348524,   325970,   303416,   280862,   258308,   235754,
+	213200,   190646,   168092,   145538,   122984,   100430,    77876,    55322,
+	32768,    10214,   -12340,   -34894,   -57448,   -80002,  -102556,  -125110,
+	-147664,  -170218,  -192772,  -215326,  -237880,  -260434,  -282988,  -305542,
+	-328096,  -350650,  -373204,  -395758,  -418312,  -440866,  -463420,  -485974,
+	-508528,  -531082,  -553636,  -576190,  -598744,  -621298,  -643852,  -666406,
+	-688960,  -711514,  -734068,  -756622,  -779176,  -801730,  -824284,  -846838,
+	-869392,  -891946,  -914500,  -937054,  -959608,  -982162, -1004716, -1027270,
+	-1049824, -1072378, -1094932, -1117486, -1140040, -1162594, -1185148, -1207702,
+	-1230256, -1252810, -1275364, -1297918, -1320472, -1343026, -1365580, -1388134,
+	-1410688, -1433242, -1455796, -1478350, -1500904, -1523458, -1546012, -1568566,
+	-1591120, -1613674, -1636228, -1658782, -1681336, -1703890, -1726444, -1748998,
+	-1771552, -1794106, -1816660, -1839214, -1861768, -1884322, -1906876, -1929430,
+	-1951984, -1974538, -1997092, -2019646, -2042200, -2064754, -2087308, -2109862,
+	-2132416, -2154970, -2177524, -2200078, -2222632, -2245186, -2267740, -2290294,
+	-2312848, -2335402, -2357956, -2380510, -2403064, -2425618, -2448172, -2470726,
+	-2493280, -2515834, -2538388, -2560942, -2583496, -2606050, -2628604, -2651158,
+	-2673712, -2696266, -2718820, -2741374, -2763928, -2786482, -2809036, -2831590,
+};
+
+__constant static uchar kRangeLimitLut[4 * 256] = {
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
+	16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
+	32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+	48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
+	64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
+	80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+	96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+	112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+	128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+	144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+	160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+	176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+	192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+	208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+	224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+	240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+};
+
+void YUVToRGB(uchar *pixelBlock) // Converts 64 interleaved 3-byte pixels in place via the lookup tables defined above.
+{
+	__constant uchar* kRangeLimit = kRangeLimitLut + 384; // biased base: indices -384..639 stay inside the 1024-entry table, which clamps to 0..255
+	for (int i = 0; i < 64; i++)
+	{
+		uchar *pixel = &pixelBlock[i * 3];
+
+		int y = pixel[0]; // bytes 0, 1, 2 of each pixel are read as y, cb, cr ...
+		int cb = pixel[1];
+		int cr = pixel[2];
+		pixel[0] = kRangeLimit[y + kCrToRedTable[cr]]; // ... and then overwritten with the red, green and blue results
+		pixel[1] = kRangeLimit[y + ((kCrToGreenTable[cr] + kCbToGreenTable[cb]) >> 16)]; // green table entries are summed first, then shifted right by 16
+		pixel[2] = kRangeLimit[y + kCbToBlueTable[cb]];
+	}
+}
+
 // chrisk todo
-void BlockToImage(coeff_t *candidate_block, float *r, float *g, float *b)
+void BlockToImage(coeff_t *block, float *r, float *g, float *b) // Fills r, g, b (64 floats each) from a block of 3 x 64 coefficients.
 {
-    // �ο�clguetzli_comparator.cpp : BlockToImage
+	uchar idct[8 * 8 * 3]; // 64 entries per component; component c starts at offset 64 * c
+	CoeffToIDCT(&block[0], &idct[0]);
+	CoeffToIDCT(&block[8 * 8], &idct[8 * 8]);
+	CoeffToIDCT(&block[8 * 8 * 2], &idct[8 * 8 * 2]);
+
+	ushort pixels[8 * 8 * 3]; // same planar layout as idct, 16 bits per sample
+
+	IDCTToImage(&idct[0], &pixels[0]);
+	IDCTToImage(&idct[8 * 8], &pixels[8 * 8]);
+	IDCTToImage(&idct[8 * 8 * 2], &pixels[8 * 8 * 2]);
+
+	uchar yuv[8 * 8 * 3]; // interleaved: each ImageToYUV call below writes every 3rd byte from its start offset
+
+	ImageToYUV(&pixels[0], &yuv[0]);
+	ImageToYUV(&pixels[8 * 8], &yuv[1]);
+	ImageToYUV(&pixels[8 * 8 * 2], &yuv[2]);
+
+	YUVToRGB(yuv); // converts in place, so despite its name yuv holds the converted triples from here on
+
+	// Srgb8ToLinearTable begin
+	double lut[256]; // NOTE(review): rebuilt on every call; a precomputed table might be preferable -- confirm before changing
+	int i = 0;
+	for (; i < 11; ++i)
+	{
+		lut[i] = i / 12.92; // entries 0..10
+	}
+	for (; i < 256; ++i)
+	{
+		lut[i] = 255.0 * pow(((i / 255.0) + 0.055) / 1.055, 2.4); // entries 11..255
+	}
+	// Srgb8ToLinearTable end
+
+	for (int i = 0; i < 8 * 8; i++)
+	{
+		r[i] = lut[yuv[3 * i]]; // de-interleave: byte 0 of each triple goes to r, byte 1 to g, byte 2 to b
+		g[i] = lut[yuv[3 * i + 1]];
+		b[i] = lut[yuv[3 * i + 2]];
+	}
 }
 
 void Convolution(size_t xsize, size_t ysize,

From e2b38304ef7ac89d9d57b6f6d6a571b40a983c38 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Tue, 16 May 2017 11:43:22 +0800
Subject: [PATCH 079/189] Add MaskHighIntensityChangeBlock

---
 clguetzli/clguetzli.cl | 51 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 3a242fdb..810e00eb 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -2397,6 +2397,57 @@ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b,
     float *c1_x, float *c1_y, float *c1_b,
     int xsize, int ysize)
 {
+	for (int x = 0; x < xsize; ++x) // x is the outer loop, y the inner one
+	{
+		for (int y = 0; y < ysize; ++y)
+		{
+			size_t ix = y * xsize + x; // row-major index of the current pixel
+			const double ave[3] = { // per-channel mean of the c0 and c1 inputs at this pixel
+				(c0_x[ix] + c1_x[ix]) * 0.5,
+				(c0_y[ix] + c1_y[ix]) * 0.5,
+				(c0_b[ix] + c1_b[ix]) * 0.5,
+			};
+			double sqr_max_diff = -1; // largest squared y-channel difference to a neighbour; stays -1 if all four are skipped
+			{
+				int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) }; // index deltas: previous column, next column, previous row, next row
+				int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize }; // nonzero when the matching neighbour lies outside the image
+				for (int dir = 0; dir < 4; ++dir) {
+					if (border[dir])
+					{
+						continue;
+					}
+					const int ix2 = ix + offset[dir];
+					double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1]; // neighbour's mean y minus this pixel's mean y
+					diff *= diff;
+					if (sqr_max_diff < diff)
+					{
+						sqr_max_diff = diff;
+					}
+				}
+			}
+			const double kReductionX = 275.19165240059317;
+			const double kReductionY = 18599.41286306991;
+			const double kReductionZ = 410.8995306951065;
+			const double kChromaBalance = 106.95800948271017;
+			double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance); // extra factor applied to the x and b weights only
+
+			const double mix[3] = { // weight given to the actual colour per channel; shrinks as sqr_max_diff grows
+				chroma_scale * kReductionX / (sqr_max_diff + kReductionX),
+				kReductionY / (sqr_max_diff + kReductionY),
+				chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ),
+			};
+			// Interpolate linearly between the average color and the actual
+			// color -- to reduce the importance of this pixel.
+			xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]);
+			xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]);
+
+			xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]);
+			xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]);
+
+			xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]);
+			xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]);
+		}
+	}
 }
 
 typedef union ocl_channels_t

From b76def649158e1334e5fdf32fb41e413ac628dd9 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 16 May 2017 16:06:11 +0800
Subject: [PATCH 080/189] =?UTF-8?q?SelectFrequencyMaskingBatch=20=E8=AE=A1?=
 =?UTF-8?q?=E7=AE=97=E6=B5=81=E7=A8=8B=E4=BF=AE=E6=AD=A3=EF=BC=8C=E7=BB=88?=
 =?UTF-8?q?=E4=BA=8E=E5=8F=AF=E4=BB=A5=E6=AD=A3=E5=B8=B8=E8=B7=91=E8=B5=B7?=
 =?UTF-8?q?=E6=9D=A5=E4=BA=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl             | 26 ++++++------
 clguetzli/clguetzli.cpp            | 20 +++++----
 clguetzli/clguetzli.h              |  2 +-
 clguetzli/clguetzli_comparator.cpp |  2 +-
 clguetzli/ocl.cpp                  | 16 ++++++-
 clguetzli/ocl.h                    |  2 +-
 guetzli/butteraugli_comparator.cc  |  6 ---
 guetzli/butteraugli_comparator.h   |  3 --
 guetzli/guetzli.cc                 |  4 --
 guetzli/processor.cc               | 67 ++++++++++++++++--------------
 10 files changed, 78 insertions(+), 70 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 810e00eb..e4565a90 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1868,7 +1868,7 @@ __constant static float bias[192] = {
 
 // chrisk todo
 // return the count of Non-zero item
-int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTScoreData *input_order, int block_size)
+int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, IntFloatPairList *input_order, int block_size)
 {
 	int size = 0;
 	for (int c = 0; c < 3; ++c) {
@@ -1880,7 +1880,7 @@ int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, DCTSco
 			}
 		}
 	}
-    return SortInputOrder(input_order, size);
+    return SortInputOrder(input_order->pData, size);
 }
 
 __constant static int kIDCTMatrix[kDCTBlockSize] = {
@@ -2558,6 +2558,7 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/
                                          __global coeff_t *block_list/*in*/,
                                          __global float *orig_image/*in*/,
                                          __global float *mask_scale/*in*/,
+                                         float BlockErrorLimit,
                                          __global CoeffData *output_order_list/*out*/)
 {
     int block_idx = get_global_id(0);
@@ -2570,10 +2571,11 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/
     DCTScoreData input_order_data[kComputeBlockSize];
     CoeffData    output_order_data[kComputeBlockSize];
 
-    int count = MakeInputOrder(block, orig_block, input_order_data, kBlockSize);
-    IntFloatPairList input_order = { count, input_order_data };
+    IntFloatPairList input_order  = { 0, input_order_data };
     IntFloatPairList output_order = { 0, output_order_data };
 
+    int count = MakeInputOrder(block, orig_block, &input_order, kBlockSize);
+
     coeff_t processed_block[kComputeBlockSize];
     for (int i = 0; i < kComputeBlockSize; i++) {
         processed_block[i] = block[i];
@@ -2617,17 +2619,15 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/
 
     __global CoeffData *output_block = output_order_list + block_idx * kComputeBlockSize;
 
-    for (int i = 0; i < kComputeBlockSize; i++)
+    int out_count = 0;
+    for (int i = 0; i < kComputeBlockSize && i < output_order.size; i++)
     {
-        if (i >= output_order.size)
-        {
-            output_block[i].idx = 0;
-            output_block[i].err = 0;
-        }
-        else
+        // Filter out entries with a larger err; feeding them into the later computation is pointless.
+        if (output_order.pData[i].err <= BlockErrorLimit)
         {
-            output_block[i].idx = output_order.pData[i].idx;
-            output_block[i].err = output_order.pData[i].err;
+            output_block[out_count].idx = output_order.pData[i].idx;
+            output_block[out_count].err = output_order.pData[i].err;
+            out_count++;
         }
     }
 }
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index b8c8ad2a..4c8b7b25 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -1194,9 +1194,9 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	clReleaseMemObject(mem_result);
 }
 
-void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_list, guetzli::coeff_t *block_list,
-                              float *orig_iamge, float* mask_scale, CoeffData *output_order_list,
-                              int size)
+void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coeff_t *block_batch,
+                              float *orig_image, float* mask_scale, CoeffData *output_order_batch,
+                              int size, float BlockErrorLimit)
 {
     using namespace guetzli;
 
@@ -1205,18 +1205,20 @@ void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_list, guetzli::coef
     cl_int err = 0;
     ocl_args_d_t &ocl = getOcl();
 
-    cl_mem mem_orig_block_list = ocl.allocMem(sizeof(coeff_t) * item_count);
-    cl_mem mem_block_list = ocl.allocMem(sizeof(coeff_t) * item_count);
-    cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * item_count);
-    cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * size);
+    cl_mem mem_orig_block_list = ocl.allocMem(sizeof(coeff_t) * item_count, orig_block_batch);
+    cl_mem mem_block_list = ocl.allocMem(sizeof(coeff_t) * item_count, block_batch);
+    cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * item_count, orig_image);
+    cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * size, mask_scale);
     cl_mem mem_output_order_list = ocl.allocMem(sizeof(CoeffData) * item_count);
+    cl_float clBlockErrorLimit = BlockErrorLimit;
 
     cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZERONGORDER];
     clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_block_list);
     clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_block_list);
     clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_orig_image);
     clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_mask_scale);
-    clSetKernelArg(kernel, 4, sizeof(cl_mem), &mem_output_order_list);
+    clSetKernelArg(kernel, 4, sizeof(cl_float), &clBlockErrorLimit);
+    clSetKernelArg(kernel, 5, sizeof(cl_mem), &mem_output_order_list);
 
     size_t globalWorkSize[1] = { size };
     err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -1232,7 +1234,7 @@ void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_list, guetzli::coef
 
     CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_list, true, CL_MAP_READ, 0, sizeof(CoeffData) * item_count, 0, NULL, NULL, &err);
     err = clFinish(ocl.commandQueue);
-    memcpy(output_order_list, result, sizeof(CoeffData) * item_count);
+    memcpy(output_order_batch, result, sizeof(CoeffData) * item_count);
 
     clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_list, result, sizeof(CoeffData) * item_count, NULL, NULL);
     clFinish(ocl.commandQueue);
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 178dce27..83bbae09 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -19,7 +19,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
     size_t step,
     float* result);
 
-void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_list, guetzli::coeff_t *block_list, float *orig_iamge, float* mask_scale, CoeffData *output_order_list, int size);
+void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coeff_t *block_batch, float *orig_image, float* mask_scale, CoeffData *output_order_batch, int size, float BlockErrorLimit); // parameter names match the definition in clguetzli.cpp
 
 void clMask(const float* r, const float* g, const float* b,
     const float* r2, const float* g2, const float* b2,
diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp
index 3babe180..eb442e21 100644
--- a/clguetzli/clguetzli_comparator.cpp
+++ b/clguetzli/clguetzli_comparator.cpp
@@ -368,7 +368,7 @@ namespace guetzli
         block_y_ = block_y;
         factor_x_ = factor_x;
         factor_y_ = factor_y;
-        return;
+
 		ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y);
 	}
 
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index 05f5470f..2f114e02 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -189,7 +189,7 @@ void* ocl_args_d_t::allocC(size_t s)
 	return outputC;
 }
 
-cl_mem ocl_args_d_t::allocMem(size_t s)
+cl_mem ocl_args_d_t::allocMem(size_t s, void *init)
 {
 	cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64;
 	cl_int err = 0;
@@ -198,6 +198,20 @@ cl_mem ocl_args_d_t::allocMem(size_t s)
 	{
 		LogError("Error: allocMem() for buffer returned %s.\n", TranslateOpenCLError(err));
 	}
+    if (mem && init)
+    {
+        err = clEnqueueWriteBuffer(this->commandQueue, mem, CL_FALSE, 0, s, init, 0, NULL, NULL);
+        if (CL_SUCCESS != err)
+        {
+            LogError("Error: allocMem() clEnqueueWriteBuffer return %s.\n", TranslateOpenCLError(err));
+        }
+        err = clFinish(this->commandQueue);
+        if (CL_SUCCESS != err)
+        {
+            LogError("Error: allocMem() clFinish return %s.\n", TranslateOpenCLError(err));
+        }
+    }
+
 	return mem;
 }
 
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index ae5ceeeb..59b4582d 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -95,7 +95,7 @@ struct ocl_args_d_t
 	void* allocB(size_t s);
 	void* allocC(size_t s);
 
-	cl_mem allocMem(size_t s);
+	cl_mem allocMem(size_t s, void *init = NULL);
 	ocl_channels allocMemChannels(size_t s);
     void releaseMemChannels(ocl_channels rgb);
 
diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc
index 9034d68e..1748b80d 100644
--- a/guetzli/butteraugli_comparator.cc
+++ b/guetzli/butteraugli_comparator.cc
@@ -22,9 +22,6 @@
 #include "guetzli/gamma_correct.h"
 #include "guetzli/score.h"
 
-int g_switchBlock = 0;
-int g_compareBlock = 0;
-
 namespace guetzli {
 
 ButteraugliComparator::ButteraugliComparator(const int width, const int height,
@@ -97,8 +94,6 @@ void ButteraugliComparator::SwitchBlock(int block_x, int block_y,
       ::butteraugli::OpsinDynamicsImage(8, 8, per_block_pregamma_[bx]);
     }
   }
-
-  g_switchBlock++;
 }
 
 double ButteraugliComparator::CompareBlock(const OutputImage& img,
@@ -114,7 +109,6 @@ double ButteraugliComparator::CompareBlock(const OutputImage& img,
   std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
   img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c);
   ::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c);
-  g_compareBlock++;
 
   std::vector<std::vector<float> > rgb0 = rgb0_c;
   std::vector<std::vector<float> > rgb1 = rgb1_c;
diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h
index 0136f2bb..bc247afe 100644
--- a/guetzli/butteraugli_comparator.h
+++ b/guetzli/butteraugli_comparator.h
@@ -26,9 +26,6 @@
 #include "guetzli/output_image.h"
 #include "guetzli/stats.h"
 
-extern int g_switchBlock;
-extern int g_compareBlock;
-
 namespace guetzli {
 
 constexpr int kButteraugliStep = 3;
diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc
index 32103a74..3f91cddd 100644
--- a/guetzli/guetzli.cc
+++ b/guetzli/guetzli.cc
@@ -233,9 +233,6 @@ void Usage() {
 
 }  // namespace
 
-extern int g_switchBlock;
-extern int g_compareBlock;
-
 int main(int argc, char** argv) {
   std::set_terminate(TerminateHandler);
 
@@ -337,6 +334,5 @@ int main(int argc, char** argv) {
 
   WriteFileOrDie(argv[opt_idx + 1], out_data);
 
-  fprintf(stderr, "%d %d", g_switchBlock, g_compareBlock);
   return 0;
 }
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index eaf9f75b..11abfa4c 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -603,30 +603,39 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
         }
     }
 
-    std::vector<CoeffData> output_order(num_blocks * kBlockSize);
+    // step 2: compute the coefficient deviations of the blocks
+    std::vector<CoeffData> output_order_gpu;
+    std::vector<CoeffData> output_order_cpu;
+    CoeffData * output_order = NULL;
     ButteraugliComparatorEx * comp = (ButteraugliComparatorEx*)comparator_;
 
-    if (g_useOpenCL)
+    if (g_useOpenCL || g_checkOpenCL)
     {
+        output_order_gpu.resize(num_blocks * kBlockSize);
+        output_order = output_order_gpu.data();
         clComputeBlockZeroingOrder(orig_block_batch.data(),
                                     block_batch.data(),
                                     comp->imgOpsinDynamicsBlockList.data(),
                                     comp->imgMaskXyzScaleBlockList.data(),
-                                    output_order.data(),
-                                    num_blocks);
+                                    output_order_gpu.data(),
+                                    num_blocks,
+                                    comparator_->BlockErrorLimit());
+
+
     }
-    else
+    if (!g_useOpenCL || g_checkOpenCL)
     {
+        output_order_cpu.resize(num_blocks * kBlockSize);
+        output_order = output_order_cpu.data();
         for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
             for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
                 coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize];
                 coeff_t *block = &block_batch[block_ix * kBlockSize];
 
                 std::vector<CoeffData> block_order;
-
                 ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, &block_order);
 
-                CoeffData * p = &output_order[block_ix * kBlockSize];
+                CoeffData * p = &output_order_cpu[block_ix * kBlockSize];
                 for (int i = 0; i < block_order.size(); i++)
                 {
                     p[i].idx = block_order[i].idx;
@@ -635,6 +644,23 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
             }
         }
     }
+    if (g_checkOpenCL)
+    {
+        int count = 0;
+        int check_size = output_order_gpu.size();
+        for (int i = 0; i < check_size; i++)
+        {
+            if (output_order_cpu[i].idx != output_order_gpu[i].idx ||
+                fabs(output_order_cpu[i].block_err - output_order_gpu[i].block_err) > 0.001)
+            {
+                count++;
+            }
+        }
+        if (count > 0)
+        {
+            LogError("CHK %s(%d) %d:%d\r\n", __FUNCTION__, __LINE__, count, check_size);
+        }
+    }
 
     std::vector<int> candidate_coeff_offsets(num_blocks + 1);
     std::vector<uint8_t> candidate_coeffs;
@@ -655,28 +681,9 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
             }
         }
     }
-/*
-    // step 2 �Ա�ÿ��block���
-    for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
-        for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
-            coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize];
-            coeff_t *block = &block_batch[block_ix * kBlockSize];
 
-            std::vector<CoeffData> block_order;
-
-           ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, &block_order);
-
-            // ���´�����Ȼû��batch���������ȼ�������������
-            candidate_coeff_offsets[block_ix] = candidate_coeffs.size();
-            for (size_t i = 0; i < block_order.size(); ++i) {
-                candidate_coeffs.push_back(block_order[i].idx);
-                candidate_coeff_errors.push_back(block_order[i].block_err);
-            }
-        }
-    }
-*/
     //
-    comparator_->FinishBlockComparisons(); // TOBEREMOVE:�������
+    comparator_->FinishBlockComparisons();
     candidate_coeff_offsets[num_blocks] = candidate_coeffs.size();
 
     SelectFrequencyBackEnd(jpg, img, 7, target_mul, stop_early,
@@ -717,10 +724,6 @@ void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const
     }
     std::sort(input_order.begin(), input_order.end(), [](const std::pair<int, float>& a, const std::pair<int, float>& b) { return a.second < b.second; });
 
-    if (input_order.size() > 64)
-    {
-        g_compareBlock++;
-    }
     coeff_t processed_block[kBlockSize];
     memcpy(processed_block, block, sizeof(processed_block));
 
@@ -1122,6 +1125,8 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in,
       const float ymul = jpg.components.size() == 1 ? 1.0f : 0.97f;
       SelectFrequencyMasking(jpg, &img, 1, ymul, false);
       SelectFrequencyMasking(jpg, &img, 6, 1.0, true);
+//      SelectFrequencyMaskingBatch(jpg, &img, ymul, false);
+//      SelectFrequencyMaskingBatch(jpg, &img, 1.0, true);
     }
   }
 

From caa4fbbcbb9a6d7822483c28ea57eb88e1bd29af Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 17 May 2017 03:08:51 +0800
Subject: [PATCH 081/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

---
 clguetzli/clbutter_comparator.cpp             |   1 +
 clguetzli/clbutter_comparator.h               |   2 +-
 clguetzli/clguetzli.cpp                       |   2 +-
 clguetzli/clguetzli.h                         |   8 +-
 clguetzli/clguetzli_comparator.cpp            | 343 +++++-------------
 clguetzli/clguetzli_comparator.h              |   5 +-
 guetzli/butteraugli_comparator.cc             |   2 +-
 guetzli/butteraugli_comparator.h              |   3 +-
 guetzli/comparator.h                          |   2 +-
 guetzli/guetzli.cc                            |   5 +-
 guetzli/processor.cc                          |  79 ++--
 guetzli/processor.h                           |   5 +
 .../butteraugli/butteraugli/butteraugli.cc    |  55 +--
 13 files changed, 161 insertions(+), 351 deletions(-)

diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index 1da9a2cd..734e2c33 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -96,6 +96,7 @@ namespace butteraugli
 
         if (g_checkOpenCL)
         {
+			temp.resize(res_xsize_ * res_ysize_);
             tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(),
                 mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(),
                 block_diff_dc.data(),
diff --git a/clguetzli/clbutter_comparator.h b/clguetzli/clbutter_comparator.h
index eb2e4e32..19ca163f 100644
--- a/clguetzli/clbutter_comparator.h
+++ b/clguetzli/clbutter_comparator.h
@@ -62,7 +62,7 @@ namespace butteraugli {
         size_t len, size_t offset,
         const float* __restrict__ multipliers,
         const float* __restrict__ inp,
-        float border_ratio,
+        double border_ratio,
         float* __restrict__ result);
     void _Blur(size_t xsize, size_t ysize, float* channel, double sigma,
         double border_ratio);
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 4c8b7b25..d2f01f3e 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -1195,7 +1195,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 }
 
 void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coeff_t *block_batch,
-                              float *orig_image, float* mask_scale, CoeffData *output_order_batch,
+                              float *orig_image, float* mask_scale, guetzli::CoeffData *output_order_batch,
                               int size, float BlockErrorLimit)
 {
     using namespace guetzli;
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 83bbae09..457de4a0 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -1,16 +1,12 @@
 #pragma once
 #include "CL\cl.h"
 #include "guetzli\jpeg_data.h"
+#include "guetzli\processor.h"
 #include "ocl.h"
 
 extern bool g_useOpenCL;
 extern bool g_checkOpenCL;
 
-struct CoeffData {
-    int idx;
-    float block_err;
-};
-
 void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b);
 
 void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
@@ -19,7 +15,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
     size_t step,
     float* result);
 
-void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coeff_t *block_batch, float *orig_iamge, float* mask_scale, CoeffData *output_order_batch, int size, float BlockErrorLimit);
+void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coeff_t *block_batch, float *orig_iamge, float* mask_scale, guetzli::CoeffData *output_order_batch, int size, float BlockErrorLimit);
 
 void clMask(const float* r, const float* g, const float* b,
     const float* r2, const float* g2, const float* b2,
diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp
index eb442e21..22b1965f 100644
--- a/clguetzli/clguetzli_comparator.cpp
+++ b/clguetzli/clguetzli_comparator.cpp
@@ -2,222 +2,19 @@
 #include <algorithm>
 #include "clguetzli_comparator.h"
 #include "guetzli\idct.h"
+#include "guetzli\color_transform.h"
+#include "guetzli\gamma_correct.h"
+#include "clguetzli\ocl.h"
+#include "clguetzli\clguetzli.h"
 
+using namespace guetzli;
 
-typedef int16_t coeff_t;
-
-const double* NewSrgb8ToLinearTable() {
-	double* table = new double[256];
-	int i = 0;
-	for (; i < 11; ++i) {
-		table[i] = i / 12.92;
-	}
-	for (; i < 256; ++i) {
-		table[i] = 255.0 * std::pow(((i / 255.0) + 0.055) / 1.055, 2.4);
-	}
-	return table;
-}
-
-const double* Srgb8ToLinearTable() {
-	static const double* const kSrgb8ToLinearTable = NewSrgb8ToLinearTable();
-	return kSrgb8ToLinearTable;
-}
-
-static const int kCrToRedTable[256] = {
-	-179, -178, -177, -175, -174, -172, -171, -170, -168, -167, -165, -164,
-	-163, -161, -160, -158, -157, -156, -154, -153, -151, -150, -149, -147,
-	-146, -144, -143, -142, -140, -139, -137, -136, -135, -133, -132, -130,
-	-129, -128, -126, -125, -123, -122, -121, -119, -118, -116, -115, -114,
-	-112, -111, -109, -108, -107, -105, -104, -102, -101, -100,  -98,  -97,
-	-95,  -94,  -93,  -91,  -90,  -88,  -87,  -86,  -84,  -83,  -81,  -80,
-	-79,  -77,  -76,  -74,  -73,  -72,  -70,  -69,  -67,  -66,  -64,  -63,
-	-62,  -60,  -59,  -57,  -56,  -55,  -53,  -52,  -50,  -49,  -48,  -46,
-	-45,  -43,  -42,  -41,  -39,  -38,  -36,  -35,  -34,  -32,  -31,  -29,
-	-28,  -27,  -25,  -24,  -22,  -21,  -20,  -18,  -17,  -15,  -14,  -13,
-	-11,  -10,   -8,   -7,   -6,   -4,   -3,   -1,    0,    1,    3,    4,
-	6,    7,    8,   10,   11,   13,   14,   15,   17,   18,   20,   21,
-	22,   24,   25,   27,   28,   29,   31,   32,   34,   35,   36,   38,
-	39,   41,   42,   43,   45,   46,   48,   49,   50,   52,   53,   55,
-	56,   57,   59,   60,   62,   63,   64,   66,   67,   69,   70,   72,
-	73,   74,   76,   77,   79,   80,   81,   83,   84,   86,   87,   88,
-	90,   91,   93,   94,   95,   97,   98,  100,  101,  102,  104,  105,
-	107,  108,  109,  111,  112,  114,  115,  116,  118,  119,  121,  122,
-	123,  125,  126,  128,  129,  130,  132,  133,  135,  136,  137,  139,
-	140,  142,  143,  144,  146,  147,  149,  150,  151,  153,  154,  156,
-	157,  158,  160,  161,  163,  164,  165,  167,  168,  170,  171,  172,
-	174,  175,  177,  178
-};
-
-static const int kCbToBlueTable[256] = {
-	-227, -225, -223, -222, -220, -218, -216, -214, -213, -211, -209, -207,
-	-206, -204, -202, -200, -198, -197, -195, -193, -191, -190, -188, -186,
-	-184, -183, -181, -179, -177, -175, -174, -172, -170, -168, -167, -165,
-	-163, -161, -159, -158, -156, -154, -152, -151, -149, -147, -145, -144,
-	-142, -140, -138, -136, -135, -133, -131, -129, -128, -126, -124, -122,
-	-120, -119, -117, -115, -113, -112, -110, -108, -106, -105, -103, -101,
-	-99,  -97,  -96,  -94,  -92,  -90,  -89,  -87,  -85,  -83,  -82,  -80,
-	-78,  -76,  -74,  -73,  -71,  -69,  -67,  -66,  -64,  -62,  -60,  -58,
-	-57,  -55,  -53,  -51,  -50,  -48,  -46,  -44,  -43,  -41,  -39,  -37,
-	-35,  -34,  -32,  -30,  -28,  -27,  -25,  -23,  -21,  -19,  -18,  -16,
-	-14,  -12,  -11,   -9,   -7,   -5,   -4,   -2,    0,    2,    4,    5,
-	7,    9,   11,   12,   14,   16,   18,   19,   21,   23,   25,   27,
-	28,   30,   32,   34,   35,   37,   39,   41,   43,   44,   46,   48,
-	50,   51,   53,   55,   57,   58,   60,   62,   64,   66,   67,   69,
-	71,   73,   74,   76,   78,   80,   82,   83,   85,   87,   89,   90,
-	92,   94,   96,   97,   99,  101,  103,  105,  106,  108,  110,  112,
-	113,  115,  117,  119,  120,  122,  124,  126,  128,  129,  131,  133,
-	135,  136,  138,  140,  142,  144,  145,  147,  149,  151,  152,  154,
-	156,  158,  159,  161,  163,  165,  167,  168,  170,  172,  174,  175,
-	177,  179,  181,  183,  184,  186,  188,  190,  191,  193,  195,  197,
-	198,  200,  202,  204,  206,  207,  209,  211,  213,  214,  216,  218,
-	220,  222,  223,  225,
-};
-
-static const int kCrToGreenTable[256] = {
-	5990656,  5943854,  5897052,  5850250,  5803448,  5756646,  5709844,  5663042,
-	5616240,  5569438,  5522636,  5475834,  5429032,  5382230,  5335428,  5288626,
-	5241824,  5195022,  5148220,  5101418,  5054616,  5007814,  4961012,  4914210,
-	4867408,  4820606,  4773804,  4727002,  4680200,  4633398,  4586596,  4539794,
-	4492992,  4446190,  4399388,  4352586,  4305784,  4258982,  4212180,  4165378,
-	4118576,  4071774,  4024972,  3978170,  3931368,  3884566,  3837764,  3790962,
-	3744160,  3697358,  3650556,  3603754,  3556952,  3510150,  3463348,  3416546,
-	3369744,  3322942,  3276140,  3229338,  3182536,  3135734,  3088932,  3042130,
-	2995328,  2948526,  2901724,  2854922,  2808120,  2761318,  2714516,  2667714,
-	2620912,  2574110,  2527308,  2480506,  2433704,  2386902,  2340100,  2293298,
-	2246496,  2199694,  2152892,  2106090,  2059288,  2012486,  1965684,  1918882,
-	1872080,  1825278,  1778476,  1731674,  1684872,  1638070,  1591268,  1544466,
-	1497664,  1450862,  1404060,  1357258,  1310456,  1263654,  1216852,  1170050,
-	1123248,  1076446,  1029644,   982842,   936040,   889238,   842436,   795634,
-	748832,   702030,   655228,   608426,   561624,   514822,   468020,   421218,
-	374416,   327614,   280812,   234010,   187208,   140406,    93604,    46802,
-	0,   -46802,   -93604,  -140406,  -187208,  -234010,  -280812,  -327614,
-	-374416,  -421218,  -468020,  -514822,  -561624,  -608426,  -655228,  -702030,
-	-748832,  -795634,  -842436,  -889238,  -936040,  -982842, -1029644, -1076446,
-	-1123248, -1170050, -1216852, -1263654, -1310456, -1357258, -1404060, -1450862,
-	-1497664, -1544466, -1591268, -1638070, -1684872, -1731674, -1778476, -1825278,
-	-1872080, -1918882, -1965684, -2012486, -2059288, -2106090, -2152892, -2199694,
-	-2246496, -2293298, -2340100, -2386902, -2433704, -2480506, -2527308, -2574110,
-	-2620912, -2667714, -2714516, -2761318, -2808120, -2854922, -2901724, -2948526,
-	-2995328, -3042130, -3088932, -3135734, -3182536, -3229338, -3276140, -3322942,
-	-3369744, -3416546, -3463348, -3510150, -3556952, -3603754, -3650556, -3697358,
-	-3744160, -3790962, -3837764, -3884566, -3931368, -3978170, -4024972, -4071774,
-	-4118576, -4165378, -4212180, -4258982, -4305784, -4352586, -4399388, -4446190,
-	-4492992, -4539794, -4586596, -4633398, -4680200, -4727002, -4773804, -4820606,
-	-4867408, -4914210, -4961012, -5007814, -5054616, -5101418, -5148220, -5195022,
-	-5241824, -5288626, -5335428, -5382230, -5429032, -5475834, -5522636, -5569438,
-	-5616240, -5663042, -5709844, -5756646, -5803448, -5850250, -5897052, -5943854,
-};
-
-static const int kCbToGreenTable[256] = {
-	2919680,  2897126,  2874572,  2852018,  2829464,  2806910,  2784356,  2761802,
-	2739248,  2716694,  2694140,  2671586,  2649032,  2626478,  2603924,  2581370,
-	2558816,  2536262,  2513708,  2491154,  2468600,  2446046,  2423492,  2400938,
-	2378384,  2355830,  2333276,  2310722,  2288168,  2265614,  2243060,  2220506,
-	2197952,  2175398,  2152844,  2130290,  2107736,  2085182,  2062628,  2040074,
-	2017520,  1994966,  1972412,  1949858,  1927304,  1904750,  1882196,  1859642,
-	1837088,  1814534,  1791980,  1769426,  1746872,  1724318,  1701764,  1679210,
-	1656656,  1634102,  1611548,  1588994,  1566440,  1543886,  1521332,  1498778,
-	1476224,  1453670,  1431116,  1408562,  1386008,  1363454,  1340900,  1318346,
-	1295792,  1273238,  1250684,  1228130,  1205576,  1183022,  1160468,  1137914,
-	1115360,  1092806,  1070252,  1047698,  1025144,  1002590,   980036,   957482,
-	934928,   912374,   889820,   867266,   844712,   822158,   799604,   777050,
-	754496,   731942,   709388,   686834,   664280,   641726,   619172,   596618,
-	574064,   551510,   528956,   506402,   483848,   461294,   438740,   416186,
-	393632,   371078,   348524,   325970,   303416,   280862,   258308,   235754,
-	213200,   190646,   168092,   145538,   122984,   100430,    77876,    55322,
-	32768,    10214,   -12340,   -34894,   -57448,   -80002,  -102556,  -125110,
-	-147664,  -170218,  -192772,  -215326,  -237880,  -260434,  -282988,  -305542,
-	-328096,  -350650,  -373204,  -395758,  -418312,  -440866,  -463420,  -485974,
-	-508528,  -531082,  -553636,  -576190,  -598744,  -621298,  -643852,  -666406,
-	-688960,  -711514,  -734068,  -756622,  -779176,  -801730,  -824284,  -846838,
-	-869392,  -891946,  -914500,  -937054,  -959608,  -982162, -1004716, -1027270,
-	-1049824, -1072378, -1094932, -1117486, -1140040, -1162594, -1185148, -1207702,
-	-1230256, -1252810, -1275364, -1297918, -1320472, -1343026, -1365580, -1388134,
-	-1410688, -1433242, -1455796, -1478350, -1500904, -1523458, -1546012, -1568566,
-	-1591120, -1613674, -1636228, -1658782, -1681336, -1703890, -1726444, -1748998,
-	-1771552, -1794106, -1816660, -1839214, -1861768, -1884322, -1906876, -1929430,
-	-1951984, -1974538, -1997092, -2019646, -2042200, -2064754, -2087308, -2109862,
-	-2132416, -2154970, -2177524, -2200078, -2222632, -2245186, -2267740, -2290294,
-	-2312848, -2335402, -2357956, -2380510, -2403064, -2425618, -2448172, -2470726,
-	-2493280, -2515834, -2538388, -2560942, -2583496, -2606050, -2628604, -2651158,
-	-2673712, -2696266, -2718820, -2741374, -2763928, -2786482, -2809036, -2831590,
-};
-
-static const uint8_t kRangeLimitLut[4 * 256] = {
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-	0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
-	16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
-	32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
-	48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
-	64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
-	80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
-	96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
-	112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
-	128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
-	144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
-	160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
-	176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
-	192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
-	208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
-	224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
-	240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-};
-
-static const uint8_t* kRangeLimit = kRangeLimitLut + 384;
-
-void CoeffToIDCT(coeff_t *block, uint8_t *idct)
+void CoeffToIDCT(const coeff_t *block, uint8_t *idct)
 {
 	guetzli::ComputeBlockIDCT(block, idct);
 }
 
-void IDCTToImage(const uint8_t idct[8 * 8], uint16_t *pixels_)
+void IDCTToPixel(const uint8_t idct[8 * 8], uint16_t *pixels_)
 {
 	const int block_x = 0;
 	const int block_y = 0;
@@ -236,7 +33,7 @@ void IDCTToImage(const uint8_t idct[8 * 8], uint16_t *pixels_)
 }
 
 // out = [YUVYUV....YUVYUV]
-void ImageToYUV(uint16_t *pixels_, uint8_t *out)
+void PixelToYUV(uint16_t *pixels_, uint8_t *out)
 {
 	const int stride = 3;
 
@@ -266,34 +63,55 @@ void YUVToRGB(uint8_t* pixelBlock)
 }
 
 // block = [R....R][G....G][B.....]
-void BlockToImage(coeff_t *block, float* r, float* g, float* b)
+void BlockToImage(const coeff_t *block, float* r, float* g, float* b, int inside_x, int inside_y)
 {
-	uint8_t idct[8 * 8 * 3];
-	CoeffToIDCT(&block[0], &idct[0]);
-	CoeffToIDCT(&block[8 * 8], &idct[8 * 8]);
-	CoeffToIDCT(&block[8 * 8 * 2], &idct[8 * 8 * 2]);
+	uint8_t idct[3][8 * 8];
+	CoeffToIDCT(&block[0], idct[0]);
+	CoeffToIDCT(&block[8 * 8], idct[1]);
+	CoeffToIDCT(&block[8 * 8 * 2], idct[2]);
 
-	uint16_t pixels[8 * 8 * 3];
+    uint16_t pixels[3][8 * 8];
 
-	IDCTToImage(&idct[0], &pixels[0]);
-	IDCTToImage(&idct[8*8], &pixels[8*8]);
-	IDCTToImage(&idct[8*8*2], &pixels[8*8*2]);
+	IDCTToPixel(idct[0], pixels[0]);
+	IDCTToPixel(idct[1], pixels[1]);
+	IDCTToPixel(idct[2], pixels[2]);
 
 	uint8_t yuv[8 * 8 * 3];
 
-	ImageToYUV(&pixels[0], &yuv[0]);
-	ImageToYUV(&pixels[8*8], &yuv[1]);
-	ImageToYUV(&pixels[8*8*2], &yuv[2]);
+	PixelToYUV(pixels[0], &yuv[0]);
+	PixelToYUV(pixels[1], &yuv[1]);
+	PixelToYUV(pixels[2], &yuv[2]);
 
     YUVToRGB(yuv);
 
 	const double* lut = Srgb8ToLinearTable();
+
 	for (int i = 0; i < 8 * 8; i++)
 	{
 		r[i] = lut[yuv[3 * i]];
 		g[i] = lut[yuv[3 * i + 1]];
 		b[i] = lut[yuv[3 * i + 2]];
 	}
+    for (int y = 0; y < inside_y; y++)
+    {
+        for (int x = inside_x; x < 8; x++)
+        {
+            int idx = y * 8 + (inside_x - 1);
+            r[y * 8 + x] = r[idx];
+            g[y * 8 + x] = g[idx];
+            b[y * 8 + x] = b[idx];
+        }
+    }
+    for (int y = inside_y; y < 8; y++)
+    {
+        for (int x = 0; x < 8; x++)
+        {
+            int idx = (inside_y - 1) * 8 + x;
+            r[y * 8 + x] = r[idx];
+            g[y * 8 + x] = g[idx];
+            b[y * 8 + x] = b[idx];
+        }
+    }
 }
 
 namespace guetzli
@@ -351,9 +169,7 @@ namespace guetzli
                 imgMaskXyzScaleBlockList[block_ix * 3 + 2] = mask_xyz_[2][ymin * width_ + xmin];
             }
         }
-
-
-	}
+    }
 
     void ButteraugliComparatorEx::FinishBlockComparisons() {
         ButteraugliComparator::FinishBlockComparisons();
@@ -362,35 +178,75 @@ namespace guetzli
         imgMaskXyzScaleBlockList.clear();
     }
 
-	void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y)
-	{
+    void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y)
+    {
         block_x_ = block_x;
         block_y_ = block_y;
         factor_x_ = factor_x;
         factor_y_ = factor_y;
 
-		ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y);
-	}
+        ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y);
+    }
 
-	double ButteraugliComparatorEx::CompareBlockEx(coeff_t* candidate_block)
-	{
+    double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const
+    {
+        double err = CompareBlockEx(img, off_x, off_y, candidate_block);
+        if (g_checkOpenCL)
+        {
+            double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block);
+            if (err1 != err)
+            {
+                LogError("Error: CompareBlock misstake.\n");
+            }
+        }
+
+        return err;
+    }
+
+    double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const
+    {
         int block_ix = getCurrentBlockIdx();
 
-        float*  block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize];
+        const float*  block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize];
 
-        // ����ڴ濽�����Ż�������������
+        // Copy this block's three channel planes from imgOpsinDynamicsBlockList into rgb0_c.
         std::vector< std::vector<float> > rgb0_c;
         rgb0_c.resize(3);
         for (int i = 0; i < 3; i++)
         {
             rgb0_c[i].resize(kDCTBlockSize);
-            memcpy(rgb0_c[i].data(), block_opsin + i*kDCTBlockSize, kDCTBlockSize * sizeof(float));
+            memcpy(rgb0_c[i].data(), block_opsin + i * kDCTBlockSize, kDCTBlockSize * sizeof(float));
         }
 
-        //
-		std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
-		BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data());
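+        // Rebuild the candidate's RGB planes from candidate_block (not from img); border_x/border_y give the part of the 8x8 block that lies inside the image.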
+        int border_x = block_x_ * 8 + 8 > width_ ? width_ - block_x_ * 8 : 8;
+        int border_y = block_y_ * 8 + 8 > height_ ? height_ - block_y_ * 8 : 8;
+        std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
+        BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), border_x, border_y);
+/*
+        {
+            // Disabled cross-check: compare rgb1_c against the RGB read back from img via ToLinearRGB.
+            int block_x = block_x_ * factor_x_ + off_x;
+            int block_y = block_y_ * factor_y_ + off_y;
+            int xmin = 8 * block_x;
+            int ymin = 8 * block_y;
+
+            std::vector<std::vector<float> > rgb1_c2(3, std::vector<float>(kDCTBlockSize));
+            img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2);
 
+            for (int i = 0; i < 3; i++)
+            {
+                for (int k = 0; k < 64; k++)
+                {
+                    if (fabs(rgb1_c[i][k] - rgb1_c2[i][k]) > 0.001)
+                    {
+                        LogError("Error: CompareBlock misstake.\n");
+                    }
+                }
+            }
+        }
+*/
+        // Computation starts here: run OpsinDynamicsImage on both 8x8 blocks.
         ::butteraugli::OpsinDynamicsImage(8, 8, rgb0_c);
 		::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c);
 
@@ -424,15 +280,10 @@ namespace guetzli
 	}
 
 
-    int ButteraugliComparatorEx::getCurrentBlockIdx(void)
+    int ButteraugliComparatorEx::getCurrentBlockIdx(void) const
     {
-        const int width = width_;
-        const int height = height_;
-        const int factor_x = 1;
-        const int factor_y = 1;
-
-        const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x);
-        const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y);
+        const int block_width = (width_ + 8 * factor_x_ - 1) / (8 * factor_x_);
+        const int block_height = (height_ + 8 * factor_y_ - 1) / (8 * factor_y_);
 
         return block_y_ * block_width + block_x_;
     }
diff --git a/clguetzli/clguetzli_comparator.h b/clguetzli/clguetzli_comparator.h
index 97f23fb9..7f3a768c 100644
--- a/clguetzli/clguetzli_comparator.h
+++ b/clguetzli/clguetzli_comparator.h
@@ -15,9 +15,10 @@ namespace guetzli {
         void FinishBlockComparisons() override;
 		void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) override;
 
-		double CompareBlockEx(coeff_t* candidate_block);
+        double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const override;
+		double CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const;
     private:
-        int getCurrentBlockIdx(void);
+        int getCurrentBlockIdx(void) const;
 	public:
 		std::vector<float> imgOpsinDynamicsBlockList;   // [RR..RRGG..GGBB..BB]:blockCount
         std::vector<float> imgMaskXyzScaleBlockList;    // [RGBRGB..RGBRGB]:blockCount
diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc
index 1748b80d..accb905f 100644
--- a/guetzli/butteraugli_comparator.cc
+++ b/guetzli/butteraugli_comparator.cc
@@ -97,7 +97,7 @@ void ButteraugliComparator::SwitchBlock(int block_x, int block_y,
 }
 
 double ButteraugliComparator::CompareBlock(const OutputImage& img,
-                                           int off_x, int off_y) const {
+                                           int off_x, int off_y, const coeff_t* candidate_block) const {
   int block_x = block_x_ * factor_x_ + off_x;
   int block_y = block_y_ * factor_y_ + off_y;
   int xmin = 8 * block_x;
diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h
index bc247afe..572a9689 100644
--- a/guetzli/butteraugli_comparator.h
+++ b/guetzli/butteraugli_comparator.h
@@ -30,7 +30,6 @@ namespace guetzli {
 
 constexpr int kButteraugliStep = 3;
 
-
 class ButteraugliComparator : public Comparator {
  public:
   ButteraugliComparator(const int width, const int height,
@@ -46,7 +45,7 @@ class ButteraugliComparator : public Comparator {
                    int factor_x, int factor_y) override;
 
   double CompareBlock(const OutputImage& img,
-                      int off_x, int off_y) const override;
+                      int off_x, int off_y, const coeff_t* candidate_block) const override;
 
   double ScoreOutputSize(int size) const override;
 
diff --git a/guetzli/comparator.h b/guetzli/comparator.h
index 00c56977..db76ac77 100644
--- a/guetzli/comparator.h
+++ b/guetzli/comparator.h
@@ -51,7 +51,7 @@ class Comparator {
   // the resulting per-block distance. The interpretation of the returned
   // distance depends on the comparator used.
   virtual double CompareBlock(const OutputImage& img,
-                              int off_x, int off_y) const = 0;
+                              int off_x, int off_y, const coeff_t* candidate_block) const = 0;
 
   // Returns the combined score of the output image in the last Compare() call
   // (or the baseline image, if Compare() was not called yet), based on output
diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc
index 3f91cddd..5982bc1c 100644
--- a/guetzli/guetzli.cc
+++ b/guetzli/guetzli.cc
@@ -226,8 +226,8 @@ void Usage() {
       "                 Default value is %d.\n"
       "  --memlimit M - Memory limit in MB. Guetzli will fail if unable to stay under\n"
       "                 the limit. Default limit is %d MB.\n"
-      "  --nomemlimit - Do not limit memory usage.\n"
-	  "  --opencl     - Use OpenCL\n", kDefaultJPEGQuality, kDefaultMemlimitMB);
+	  "  --opencl     - Use OpenCL\n"
+      "  --nomemlimit - Do not limit memory usage.\n", kDefaultJPEGQuality, kDefaultMemlimitMB);
   exit(1);
 }
 
@@ -333,6 +333,5 @@ int main(int argc, char** argv) {
   }
 
   WriteFileOrDie(argv[opt_idx + 1], out_data);
-
   return 0;
 }
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 11abfa4c..e3bd4be4 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -22,8 +22,6 @@
 #include <vector>
 
 #include "guetzli/butteraugli_comparator.h"
-#include "clguetzli\clguetzli_comparator.h"
-#include "clguetzli\clguetzli.h"
 #include "guetzli/comparator.h"
 #include "guetzli/debug_print.h"
 #include "guetzli/fast_log.h"
@@ -33,18 +31,15 @@
 #include "guetzli/jpeg_data_writer.h"
 #include "guetzli/output_image.h"
 #include "guetzli/quantize.h"
+#include "clguetzli\clguetzli_comparator.h"
+#include "clguetzli\clguetzli.h"
 
 namespace guetzli {
 
 namespace {
 
 static const size_t kBlockSize = 3 * kDCTBlockSize;
-/*
-struct CoeffData {
-  int idx;
-  float block_err;
-};
-*/
+
 struct QuantData {
   int q[3][kDCTBlockSize];
   size_t jpg_size;
@@ -381,14 +376,12 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
 }
 
 
-
 // REQUIRES: block[c*64...(c*64+63)] is all zero if (comp_mask & (1<<c)) == 0.
 void Processor::ComputeBlockZeroingOrder(
     const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
     const int block_x, const int block_y, const int factor_x,
     const int factor_y, const uint8_t comp_mask, OutputImage* img,
     std::vector<CoeffData>* output_order) {
-
   static const uint8_t oldCsf[kDCTBlockSize] = {
       10, 10, 20, 40, 60, 70, 80, 90,
       10, 20, 30, 60, 70, 80, 90, 90,
@@ -421,64 +414,52 @@ void Processor::ComputeBlockZeroingOrder(
   std::sort(input_order.begin(), input_order.end(),
             [](const std::pair<int, float>& a, const std::pair<int, float>& b) {
               return a.second < b.second; });
-
-
-	coeff_t processed_block[kBlockSize];
-	memcpy(processed_block, block, sizeof(processed_block));
-
-	comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y);
-
-
+  coeff_t processed_block[kBlockSize];
+  memcpy(processed_block, block, sizeof(processed_block));
+  comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y);
   while (!input_order.empty()) {
     float best_err = 1e17f;
     int best_i = 0;
-    for (size_t i = 0; i < std::min<size_t>(params_.zeroing_greedy_lookahead, input_order.size()); ++i)
-    {
+    for (size_t i = 0; i < std::min<size_t>(params_.zeroing_greedy_lookahead,
+                                         input_order.size());
+         ++i) {
       coeff_t candidate_block[kBlockSize];
       memcpy(candidate_block, processed_block, sizeof(candidate_block));
-
       const int idx = input_order[i].first;
-
-      candidate_block[idx] = 0; // TOBEREMOVE:�Ա�block������÷�ǰi�͵���0(i����input_order���ݱ仯���仯)���������ûضԱ�ͼ�������������Ӧblock��ȥ�����������ԱȲ��á�
-
+      candidate_block[idx] = 0;
       for (int c = 0; c < 3; ++c) {
         if (comp_mask & (1 << c)) {
-          img->component(c).SetCoeffBlock(block_x, block_y, &candidate_block[c * kDCTBlockSize]);
+          img->component(c).SetCoeffBlock(
+              block_x, block_y, &candidate_block[c * kDCTBlockSize]);
         }
       }
-
       float max_err = 0;
-
       for (int iy = 0; iy < factor_y; ++iy) {
         for (int ix = 0; ix < factor_x; ++ix) {
           int block_xx = block_x * factor_x + ix;
           int block_yy = block_y * factor_y + iy;
           if (8 * block_xx < img->width() && 8 * block_yy < img->height()) {
-            float err = static_cast<float>(comparator_->CompareBlock(*img, ix, iy)); // TOBEREMOVE:��ԭͼ�Ķ�Ӧblock�Ƚϣ����ش���ֵ
+            float err = static_cast<float>(comparator_->CompareBlock(*img, ix, iy, candidate_block));
             max_err = std::max(max_err, err);
           }
         }
       }
-
-      if (max_err < best_err) { // TOBEREMOVE:�ҳ���С����ֵ��i
+      if (max_err < best_err) {
         best_err = max_err;
         best_i = i;
       }
     }
-
     int idx = input_order[best_i].first;
     processed_block[idx] = 0;
     input_order.erase(input_order.begin() + best_i);
-
-    output_order->push_back({idx, best_err}); // TOBEREMOVE:����������������С�����idx����Ӧ���Ա�block�еĶ�Ӧλ����������Ϊ0,�Ƴ�input_order���ѡȡ��ǰֵ������output_order,����ʽ�����õ��Ա�ͼ����ȥ��
+    output_order->push_back({idx, best_err});
     for (int c = 0; c < 3; ++c) {
       if (comp_mask & (1 << c)) {
-        img->component(c).SetCoeffBlock(block_x, block_y, &processed_block[c * kDCTBlockSize]);
+        img->component(c).SetCoeffBlock(
+            block_x, block_y, &processed_block[c * kDCTBlockSize]);
       }
     }
   }
-
-  // TOBEREMOVE:�����Ƴ�err������error���Ƶ���أ�����ԭ�Ա�ͼ��ԭʼֵ��
   // Make the block error values monotonic.
   float min_err = 1e10;
   for (int i = output_order->size() - 1; i >= 0; --i) {
@@ -495,7 +476,8 @@ void Processor::ComputeBlockZeroingOrder(
   // Restore *img to the same state as it was at the start of this function.
   for (int c = 0; c < 3; ++c) {
     if (comp_mask & (1 << c)) {
-      img->component(c).SetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]);
+      img->component(c).SetCoeffBlock(
+          block_x, block_y, &block[c * kDCTBlockSize]);
     }
   }
 }
@@ -741,7 +723,7 @@ void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const
 
             candidate_block[idx] = 0; // TOBEREMOVE:�Ա�block������÷�ǰi�͵���0(i����input_order���ݱ仯���仯)���������ûضԱ�ͼ�������������Ӧblock��ȥ�����������ԱȲ��á�
 
-            float max_err = ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(candidate_block);
+            float max_err = 0;/// ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(img, 0, 0, candidate_block);
             if (max_err < best_err) { // TOBEREMOVE:�ҳ���С����ֵ��i
                 best_err = max_err;
                 best_i = i;
@@ -777,6 +759,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
                                        bool stop_early) {
   const int width = img->width();
   const int height = img->height();
+  const int ncomp = jpg.components.size();
   const int last_c = Log2FloorNonZero(comp_mask);
   if (static_cast<size_t>(last_c) >= jpg.components.size()) return;
   const int factor_x = img->component(last_c).factor_x();
@@ -792,7 +775,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
   candidate_coeff_errors.reserve(60 * num_blocks);
   std::vector<CoeffData> block_order;
   block_order.reserve(3 * kDCTBlockSize);
-  comparator_->StartBlockComparisons(); // TOBEREMOVE:��ʼ��һЩ����
+  comparator_->StartBlockComparisons();
   for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
     for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
       coeff_t block[kBlockSize] = { 0 };
@@ -802,25 +785,25 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
           assert(img->component(c).factor_x() == factor_x);
           assert(img->component(c).factor_y() == factor_y);
           img->component(c).GetCoeffBlock(block_x, block_y,
-                                          &block[c * kDCTBlockSize]); // TOBEREMOVE:ȡ���Ա�ͼ��blockϵ��
+                                          &block[c * kDCTBlockSize]);
           const JPEGComponent& comp = jpg.components[c];
           int jpg_block_ix = block_y * comp.width_in_blocks + block_x;
           memcpy(&orig_block[c * kDCTBlockSize],
                  &comp.coeffs[jpg_block_ix * kDCTBlockSize],
-                 kDCTBlockSize * sizeof(orig_block[0])); // TOBEREMOVE:ȡ��ԭʼͼ��blockϵ��
+                 kDCTBlockSize * sizeof(orig_block[0]));
         }
       }
       block_order.clear();
       ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x,
-                               factor_y, comp_mask, img, &block_order); // TOBEREMOVE:����ԭʼblock�ͶԱ�ͼ��block����zeroing order����block_order
+                               factor_y, comp_mask, img, &block_order);
       candidate_coeff_offsets[block_ix] = candidate_coeffs.size();
-      for (size_t i = 0; i < block_order.size(); ++i) { // TOBEREMOVE:�ѽ����ֵ����ѡϵ��
+      for (size_t i = 0; i < block_order.size(); ++i) {
         candidate_coeffs.push_back(block_order[i].idx);
         candidate_coeff_errors.push_back(block_order[i].block_err);
       }
     }
   }
-  comparator_->FinishBlockComparisons(); // TOBEREMOVE:�������
+  comparator_->FinishBlockComparisons();
   candidate_coeff_offsets[num_blocks] = candidate_coeffs.size();
 
   SelectFrequencyBackEnd(jpg, img, comp_mask, target_mul, stop_early,
@@ -908,8 +891,7 @@ void Processor::SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img,
                                 global_order.push_back(std::make_pair(block_ix, val));
                             }
                             blocks_to_change += (last_index < num_candidates ? 1 : 0);
-                        }
-                        else {
+                        } else {
                             for (int i = last_index - 1; i >= 0; --i) {
                                 float val = ((max_err - candidate_errors[i]) /
                                     block_weight[block_ix]);
@@ -1119,14 +1101,11 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in,
     img.ApplyGlobalQuantization(best_q);
 
     if (!downsample) {
-      //SelectFrequencyMasking(jpg, &img, 7, 1.0, false);
-        SelectFrequencyMaskingBatch(jpg, &img, 1.0, false);
+        SelectFrequencyMasking(jpg, &img, 7, 1.0, false);
     } else {
       const float ymul = jpg.components.size() == 1 ? 1.0f : 0.97f;
       SelectFrequencyMasking(jpg, &img, 1, ymul, false);
       SelectFrequencyMasking(jpg, &img, 6, 1.0, true);
-//      SelectFrequencyMaskingBatch(jpg, &img, ymul, false);
-//      SelectFrequencyMaskingBatch(jpg, &img, 1.0, true);
     }
   }
 
diff --git a/guetzli/processor.h b/guetzli/processor.h
index c2beb7e0..b36b184e 100644
--- a/guetzli/processor.h
+++ b/guetzli/processor.h
@@ -26,6 +26,11 @@
 
 namespace guetzli {
 
+struct CoeffData {
+    int idx;
+    float block_err;
+};
+
 struct Params {
   float butteraugli_target = 1.0;
   bool clear_metadata = true;
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 73b78a05..288bee78 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -30,7 +30,6 @@
 //   * Blur - to hold the smoothing code
 
 #include "butteraugli/butteraugli.h"
-#include "clguetzli\clbutter_comparator.h"
 
 #include <assert.h>
 #include <math.h>
@@ -41,6 +40,7 @@
 #include <algorithm>
 #include <array>
 
+#include "clguetzli\clbutter_comparator.h"
 #include "clguetzli\clguetzli.h"
 #include "clguetzli\clguetzli_test.h"
 
@@ -64,30 +64,29 @@ inline double DotProduct(const float u[3], const double v[3]) {
 
 // Computes a horizontal convolution and transposes the result.
 void _Convolution(size_t xsize, size_t ysize,
-	size_t xstep,
-	size_t len, size_t offset,
-	const float* __restrict__ multipliers,
-	const float* __restrict__ inp,
-	float border_ratio,
-	float* __restrict__ result) {
+                        size_t xstep,
+                        size_t len, size_t offset,
+                        const float* __restrict__ multipliers,
+                        const float* __restrict__ inp,
+                        double border_ratio,
+                        float* __restrict__ result) {
   PROFILER_FUNC;
-  float weight_no_border = 0;
-
+  double weight_no_border = 0;
   for (size_t j = 0; j <= 2 * offset; ++j) {
     weight_no_border += multipliers[j];
   }
   for (size_t x = 0, ox = 0; x < xsize; x += xstep, ox++) {
     int minx = x < offset ? 0 : x - offset;
     int maxx = std::min(xsize, x + len - offset) - 1;
-    float weight = 0.0;
+    double weight = 0.0;
     for (int j = minx; j <= maxx; ++j) {
       weight += multipliers[j - x + offset];
     }
     // Interpolate linearly between the no-border scaling and border scaling.
     weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
-    float scale = 1.0 / weight;
+    double scale = 1.0 / weight;
     for (size_t y = 0; y < ysize; ++y) {
-      float sum = 0.0;
+      double sum = 0.0;
       for (int j = minx; j <= maxx; ++j) {
         sum += inp[y * xsize + j] * multipliers[j - x + offset];
       }
@@ -744,7 +743,6 @@ const double *GetOpsinAbsorbance() {
   return &kMix[0];
 }
 
-// mix��һ��[4x4]������in[,,,1]���в��
 void OpsinAbsorbance(const double in[3], double out[3]) {
   const double *mix = GetOpsinAbsorbance();
   out[0] = mix[0] * in[0] + mix[1] * in[1] + mix[2] * in[2] + mix[3];
@@ -874,24 +872,6 @@ inline void ClenshawRecursion<0>(const double x, const double *coefficients,
   *b1 = x_b1 - (*b2) + coefficients[0];
 }
 
-void ClenshawRecursion_fun(const double x, const double *coefficients,
-	double *b1, double *b2, int n)
-{
-	if (n == 0) {
-		const double x_b1 = x * (*b1);
-		// The final iteration differs - no 2 * x_b1 here.
-		*b1 = x_b1 - (*b2) + coefficients[0];
-		return;
-	}
-
-	const double x_b1 = x * (*b1);
-	const double t = (x_b1 + x_b1) - (*b2) + coefficients[n];
-	*b2 = *b1;
-	*b1 = t;
-
-	ClenshawRecursion_fun(x, coefficients, b1, b2, n - 1);
-}
-
 // Rational polynomial := dividing two polynomial evaluations. These are easier
 // to find than minimax polynomials.
 struct RationalPolynomial {
@@ -900,9 +880,7 @@ struct RationalPolynomial {
                                    const double (&coefficients)[N]) {
     double b1 = 0.0;
     double b2 = 0.0;
-
     ClenshawRecursion<N - 1>(x, coefficients, &b1, &b2);
-
     return b1;
   }
 
@@ -1052,7 +1030,6 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage(
     const std::vector<std::vector<float>> &xyb0_arg,
     std::vector<std::vector<float>> &xyb1,
     std::vector<float> &result) {
-
   if (xsize_ < 8 || ysize_ < 8) return;
   auto xyb0 = xyb0_arg;
   {
@@ -1154,7 +1131,6 @@ void ButteraugliComparator::EdgeDetectorLowFreq(
     const std::vector<std::vector<float> > &xyb0,
     const std::vector<std::vector<float> > &xyb1,
     std::vector<float>* block_diff_ac) {
-
   PROFILER_FUNC;
   static const double kSigma = 14;
   static const double kMul = 10;
@@ -1214,12 +1190,10 @@ void ButteraugliComparator::CombineChannels(
     const std::vector<float>& block_diff_ac,
     const std::vector<float>& edge_detector_map,
     std::vector<float>* result) {
-
   PROFILER_FUNC;
   result->resize(res_xsize_ * res_ysize_);
-
   for (size_t res_y = 0; res_y + (8 - step_) < ysize_; res_y += step_) {
-    for (size_t res_x = 0, j = 0; res_x + (8 - step_) < xsize_; res_x += step_, j++) {
+    for (size_t res_x = 0; res_x + (8 - step_) < xsize_; res_x += step_) {
       size_t res_ix = (res_y * res_xsize_ + res_x) / step_;
       double mask[3];
       double dc_mask[3];
@@ -1330,9 +1304,14 @@ double MaskDcB(double delta) {
   return InterpolateClampNegative(lut.data(), lut.size(), delta);
 }
 
+// Replaces values[x + y * xsize] with the minimum of the values in the
+// square_size square with coordinates
+//   x - offset .. x + square_size - offset - 1,
+//   y - offset .. y + square_size - offset - 1.
 void _MinSquareVal(size_t square_size, size_t offset,
 				  size_t xsize, size_t ysize,
                   float *values) {
+  PROFILER_FUNC;
   // offset is not negative and smaller than square_size.
   assert(offset < square_size);
   std::vector<float> tmp(xsize * ysize);

From 8c63e20b589df03a46fa541fcf4b79acca91394b Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 17 May 2017 10:15:54 +0800
Subject: [PATCH 082/189] =?UTF-8?q?=E5=AF=B9=E4=BA=8E8x8=E7=9A=84=E5=9D=97?=
 =?UTF-8?q?=EF=BC=8C=E6=9A=82=E6=97=B6=E4=B8=8D=E5=81=9Acheck=EF=BC=8C?=
 =?UTF-8?q?=E5=90=A6=E5=88=99=E9=80=9F=E5=BA=A6=E5=A4=AA=E6=85=A2=E4=BA=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clbutter_comparator.cpp  | 46 +++++++++++++++---------------
 clguetzli/clguetzli_comparator.cpp |  2 +-
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index 734e2c33..c61c8578 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -33,7 +33,7 @@ namespace butteraugli
     {
         ButteraugliComparator::BlockDiffMap(xyb0, xyb1, block_diff_dc, block_diff_ac);
 
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8)
         {
             tclBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
@@ -49,7 +49,7 @@ namespace butteraugli
     {
         ButteraugliComparator::EdgeDetectorMap(xyb0, xyb1, edge_detector_map);
 
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8)
         {
             tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
@@ -63,14 +63,14 @@ namespace butteraugli
         std::vector<float>* block_diff_ac)
     {
         std::vector<float> orign_ac;
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8)
         {
             orign_ac = *block_diff_ac;
         }
 
         ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac);
 
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8)
         {
             tclEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
@@ -87,14 +87,14 @@ namespace butteraugli
         std::vector<float>* result)
     {
         std::vector<float> temp;
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8)
         {
             temp = *result;
         }
 
         ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result);
 
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8)
         {
 			temp.resize(res_xsize_ * res_ysize_);
             tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(),
@@ -107,7 +107,7 @@ namespace butteraugli
     void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values)
     {
         std::vector<float> img;
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize > 8 && ysize > 8)
         {
             img.resize(xsize * ysize);
             memcpy(img.data(), values, xsize * ysize * sizeof(float));
@@ -116,7 +116,7 @@ namespace butteraugli
         _MinSquareVal(square_size, offset, xsize, ysize, values);
 
 
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize > 8 && ysize > 8)
         {
             tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values);
         }
@@ -125,14 +125,14 @@ namespace butteraugli
     void Average5x5(int xsize, int ysize, std::vector<float>* diffs)
     {
         std::vector<float> diffs_org;
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize > 8 && ysize > 8)
         {
             diffs_org = *diffs;
         }
 
         _Average5x5(xsize, ysize, diffs);
 
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize > 8 && ysize > 8)
         {
             tclAverage5x5(xsize, ysize, diffs_org, *diffs);
         }
@@ -142,7 +142,7 @@ namespace butteraugli
     {
         _DiffPrecompute(xyb0, xyb1, xsize, ysize, mask);
 
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize > 8 && ysize > 8)
         {
             tclDiffPrecompute(xyb0, xyb1, xsize, ysize, mask);
         }
@@ -154,7 +154,7 @@ namespace butteraugli
         std::vector<std::vector<float> > *mask,
         std::vector<std::vector<float> > *mask_dc)
     {
-        if (g_useOpenCL)
+        if (g_useOpenCL && xsize > 100 && ysize > 100)
         {
             mask->resize(3);
             mask_dc->resize(3);
@@ -173,7 +173,7 @@ namespace butteraugli
 
         _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc);
 
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize > 8 && ysize > 8)
         {
             tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
@@ -188,14 +188,14 @@ namespace butteraugli
         std::vector<float>* diffmap)
     {
         std::vector<float> diffmap_org;
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize > 8 && ysize > 8)
         {
             diffmap_org = *diffmap;
         }
 
         _CalculateDiffmap(xsize, ysize, step, diffmap);
 
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize > 8 && ysize > 8)
         {
             tclCalculateDiffmap(xsize, ysize, step, diffmap_org.data(), diffmap_org.size(), (*diffmap).data());
         }
@@ -210,7 +210,7 @@ namespace butteraugli
     {
         _MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1);
 
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize > 8 && ysize > 8)
         {
             tclMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(),
                 c1[0].data(), c1[1].data(), c1[2].data(),
@@ -223,14 +223,14 @@ namespace butteraugli
     void ScaleImage(double scale, std::vector<float> *result)
     {
         std::vector<float> result_org;
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && result->size() > 64)
         {
             result_org = *result;
         }
 
         _ScaleImage(scale, result);
 
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && result->size() > 64)
         {
             tclScaleImage(scale, result_org.data(), (*result).data(), (*result).size());
         }
@@ -246,7 +246,7 @@ namespace butteraugli
     {
         _Convolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
 
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize > 8 && ysize > 8)
         {
             tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
         }
@@ -256,7 +256,7 @@ namespace butteraugli
         double border_ratio)
     {
         std::vector<float> orignChannel;
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize > 8 && ysize > 8)
         {
             orignChannel.resize(xsize * ysize);
             memcpy(orignChannel.data(), channel, xsize * ysize * sizeof(float));
@@ -264,7 +264,7 @@ namespace butteraugli
 
         _Blur(xsize, ysize, channel, sigma, border_ratio);
 
-        if (g_checkOpenCL)
+        if (g_checkOpenCL && xsize > 8 && ysize > 8)
         {
             tclBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel);
         }
@@ -284,14 +284,14 @@ namespace butteraugli
         else
         {
             std::vector< std::vector<float>> orig_rgb;
-            if (g_checkOpenCL)
+            if (g_checkOpenCL && xsize > 8 && ysize > 8)
             {
                 orig_rgb = rgb;
             }
 
             _OpsinDynamicsImage(xsize, ysize, rgb);
 
-            if (g_checkOpenCL)
+            if (g_checkOpenCL && xsize > 8 && ysize > 8)
             {
                 tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), xsize, ysize,
                     rgb[0].data(), rgb[1].data(), rgb[2].data());
diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp
index 22b1965f..2b705d86 100644
--- a/clguetzli/clguetzli_comparator.cpp
+++ b/clguetzli/clguetzli_comparator.cpp
@@ -196,7 +196,7 @@ namespace guetzli
             double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block);
             if (err1 != err)
             {
-                LogError("Error: CompareBlock misstake.\n");
+                LogError("CHK %s(%d) \r\n", __FUNCTION__, __LINE__);
             }
         }
 

From 6b8bebfeb2cceaa1109dc0d63d42d9379f947267 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 17 May 2017 20:45:02 +0800
Subject: [PATCH 083/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

---
 clguetzli/clguetzli.cl             | 337 +++++++++++++++++++----
 clguetzli/clguetzli.cpp            |  45 ++--
 clguetzli/clguetzli.h              |   8 +-
 clguetzli/clguetzli_comparator.cpp | 419 ++++++++++++++++++++++++++---
 clguetzli/clguetzli_comparator.h   |   9 +-
 clguetzli/ocl.cpp                  |   2 +-
 clguetzli/ocl.h                    |   2 +-
 guetzli/butteraugli_comparator.cc  |   4 +-
 guetzli/butteraugli_comparator.h   |   2 +-
 guetzli/comparator.h               |   2 +-
 guetzli/processor.cc               |  58 ++--
 11 files changed, 746 insertions(+), 142 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index e4565a90..ab595dde 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1993,7 +1993,7 @@ void Compute1dIDCT(const coeff_t* in, const int stride, int out[8]) {
 	out[7] -= tmp1;
 }
 
-void CoeffToIDCT(coeff_t *block, uchar * out)
+void CoeffToIDCT(__private const coeff_t block[8*8], uchar out[8*8])
 {
 	coeff_t colidcts[kDCTBlockSize];
 	const int kColScale = 11;
@@ -2020,7 +2020,7 @@ void CoeffToIDCT(coeff_t *block, uchar * out)
 	}
 }
 
-void IDCTToImage(uchar *idct, ushort *pixels_)
+void IDCTToPixel(const uchar idct[8*8], ushort pixels_[8*8])
 {
 	const int block_x = 0;
 	const int block_y = 0;
@@ -2040,7 +2040,7 @@ void IDCTToImage(uchar *idct, ushort *pixels_)
 	}
 }
 
-void ImageToYUV(ushort *pixels_, uchar *out)
+void PixelToYUV(const ushort pixels_[8*8], uchar out[8*8])
 {
 	const int stride = 3;
 
@@ -2242,7 +2242,7 @@ __constant static uchar kRangeLimitLut[4 * 256] = {
 	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 };
 
-void YUVToRGB(uchar *pixelBlock)
+void YUVToRGB(__private uchar pixelBlock[3*8*8])
 {
 	__constant uchar* kRangeLimit = kRangeLimitLut + 384;
 	for (int i = 0; i < 64; i++)
@@ -2258,46 +2258,290 @@ void YUVToRGB(uchar *pixelBlock)
 	}
 }
 
+__constant static double kSrgb8ToLinearTable[256] = {
+	0.000000,
+	0.077399,
+	0.154799,
+	0.232198,
+	0.309598,
+	0.386997,
+	0.464396,
+	0.541796,
+	0.619195,
+	0.696594,
+	0.773994,
+	0.853367,
+	0.937509,
+	1.026303,
+	1.119818,
+	1.218123,
+	1.321287,
+	1.429375,
+	1.542452,
+	1.660583,
+	1.783830,
+	1.912253,
+	2.045914,
+	2.184872,
+	2.329185,
+	2.478910,
+	2.634105,
+	2.794824,
+	2.961123,
+	3.133055,
+	3.310673,
+	3.494031,
+	3.683180,
+	3.878171,
+	4.079055,
+	4.285881,
+	4.498698,
+	4.717556,
+	4.942502,
+	5.173584,
+	5.410848,
+	5.654341,
+	5.904108,
+	6.160196,
+	6.422649,
+	6.691512,
+	6.966827,
+	7.248640,
+	7.536993,
+	7.831928,
+	8.133488,
+	8.441715,
+	8.756651,
+	9.078335,
+	9.406810,
+	9.742115,
+	10.084290,
+	10.433375,
+	10.789410,
+	11.152432,
+	11.522482,
+	11.899597,
+	12.283815,
+	12.675174,
+	13.073712,
+	13.479465,
+	13.892470,
+	14.312765,
+	14.740385,
+	15.175366,
+	15.617744,
+	16.067555,
+	16.524833,
+	16.989614,
+	17.461933,
+	17.941824,
+	18.429322,
+	18.924460,
+	19.427272,
+	19.937793,
+	20.456054,
+	20.982090,
+	21.515934,
+	22.057618,
+	22.607175,
+	23.164636,
+	23.730036,
+	24.303404,
+	24.884774,
+	25.474176,
+	26.071642,
+	26.677203,
+	27.290891,
+	27.912736,
+	28.542769,
+	29.181020,
+	29.827520,
+	30.482299,
+	31.145387,
+	31.816813,
+	32.496609,
+	33.184802,
+	33.881422,
+	34.586499,
+	35.300062,
+	36.022139,
+	36.752760,
+	37.491953,
+	38.239746,
+	38.996169,
+	39.761248,
+	40.535013,
+	41.317491,
+	42.108710,
+	42.908697,
+	43.717481,
+	44.535088,
+	45.361546,
+	46.196882,
+	47.041124,
+	47.894297,
+	48.756429,
+	49.627547,
+	50.507676,
+	51.396845,
+	52.295078,
+	53.202402,
+	54.118843,
+	55.044428,
+	55.979181,
+	56.923129,
+	57.876298,
+	58.838712,
+	59.810398,
+	60.791381,
+	61.781686,
+	62.781338,
+	63.790363,
+	64.808784,
+	65.836627,
+	66.873918,
+	67.920679,
+	68.976937,
+	70.042715,
+	71.118037,
+	72.202929,
+	73.297414,
+	74.401516,
+	75.515259,
+	76.638668,
+	77.771765,
+	78.914575,
+	80.067122,
+	81.229428,
+	82.401518,
+	83.583415,
+	84.775142,
+	85.976722,
+	87.188178,
+	88.409534,
+	89.640813,
+	90.882037,
+	92.133229,
+	93.394412,
+	94.665609,
+	95.946841,
+	97.238133,
+	98.539506,
+	99.850982,
+	101.172584,
+	102.504334,
+	103.846254,
+	105.198366,
+	106.560693,
+	107.933256,
+	109.316077,
+	110.709177,
+	112.112579,
+	113.526305,
+	114.950375,
+	116.384811,
+	117.829635,
+	119.284868,
+	120.750532,
+	122.226647,
+	123.713235,
+	125.210317,
+	126.717914,
+	128.236047,
+	129.764737,
+	131.304005,
+	132.853871,
+	134.414357,
+	135.985483,
+	137.567270,
+	139.159738,
+	140.762907,
+	142.376799,
+	144.001434,
+	145.636832,
+	147.283012,
+	148.939997,
+	150.607804,
+	152.286456,
+	153.975971,
+	155.676371,
+	157.387673,
+	159.109900,
+	160.843070,
+	162.587203,
+	164.342319,
+	166.108438,
+	167.885578,
+	169.673761,
+	171.473005,
+	173.283330,
+	175.104755,
+	176.937299,
+	178.780982,
+	180.635824,
+	182.501843,
+	184.379058,
+	186.267489,
+	188.167154,
+	190.078073,
+	192.000265,
+	193.933749,
+	195.878543,
+	197.834666,
+	199.802137,
+	201.780975,
+	203.771198,
+	205.772826,
+	207.785876,
+	209.810367,
+	211.846319,
+	213.893748,
+	215.952674,
+	218.023115,
+	220.105089,
+	222.198615,
+	224.303711,
+	226.420395,
+	228.548685,
+	230.688599,
+	232.840156,
+	235.003373,
+	237.178269,
+	239.364861,
+	241.563167,
+	243.773205,
+	245.994993,
+	248.228549,
+	250.473890,
+	252.731035,
+	255.000000,
+};
+
 // chrisk todo
-void BlockToImage(coeff_t *block, float *r, float *g, float *b)
+void BlockToImage(__private coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8])
 {
-	uchar idct[8 * 8 * 3];
+	uchar idct[3][8 * 8];
 	CoeffToIDCT(&block[0], &idct[0]);
-	CoeffToIDCT(&block[8 * 8], &idct[8 * 8]);
-	CoeffToIDCT(&block[8 * 8 * 2], &idct[8 * 8 * 2]);
-
-	ushort pixels[8 * 8 * 3];
+	CoeffToIDCT(&block[8 * 8], &idct[1]);
+	CoeffToIDCT(&block[8 * 8 * 2], &idct[2]);
 
-	IDCTToImage(&idct[0], &pixels[0]);
-	IDCTToImage(&idct[8 * 8], &pixels[8 * 8]);
-	IDCTToImage(&idct[8 * 8 * 2], &pixels[8 * 8 * 2]);
+	ushort pixels[3][8 * 8];
+	IDCTToPixel(&idct[0], &pixels[0]);
+	IDCTToPixel(&idct[1], &pixels[1]);
+	IDCTToPixel(&idct[2], &pixels[2]);
 
 	uchar yuv[8 * 8 * 3];
-
-	ImageToYUV(&pixels[0], &yuv[0]);
-	ImageToYUV(&pixels[8 * 8], &yuv[1]);
-	ImageToYUV(&pixels[8 * 8 * 2], &yuv[2]);
+	PixelToYUV(&pixels[0], &yuv[0]);
+	PixelToYUV(&pixels[1], &yuv[1]);
+	PixelToYUV(&pixels[2], &yuv[2]);
 
 	YUVToRGB(yuv);
 
-	// Srgb8ToLinearTable begin
-	double lut[256];
-	int i = 0;
-	for (; i < 11; ++i)
-	{
-		lut[i] = i / 12.92;
-	}
-	for (; i < 256; ++i)
-	{
-		lut[i] = 255.0 * pow(((i / 255.0) + 0.055) / 1.055, 2.4);
-	}
-	// Srgb8ToLinearTable end
-
 	for (int i = 0; i < 8 * 8; i++)
 	{
-		r[i] = lut[yuv[3 * i]];
-		g[i] = lut[yuv[3 * i + 1]];
-		b[i] = lut[yuv[3 * i + 2]];
+		r[i] = kSrgb8ToLinearTable[yuv[3 * i]];
+		g[i] = kSrgb8ToLinearTable[yuv[3 * i + 1]];
+		b[i] = kSrgb8ToLinearTable[yuv[3 * i + 2]];
 	}
 }
 
@@ -2514,7 +2758,7 @@ float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block,
 
         MaskHighIntensityChangeBlock(rgb0[0],rgb0[1], rgb0[2],
                                      rgb1[0], rgb1[1], rgb1[2],
-                                    rgb0_c.ch[0], rgb0_c.ch[1], rgb0_c.ch[2],
+                                     rgb0_c.ch[0], rgb0_c.ch[1], rgb0_c.ch[2],
                                      rgb1_c.ch[0], rgb1_c.ch[1], rgb1_c.ch[2],
                                      8, 8);
 
@@ -2548,25 +2792,20 @@ float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block,
 }
 
 // strong todo
-// orig_block_list [R....R][G....G][B....B]
-// block_list [R....R][G....G][B....B]
-// orig_image [RR..RRGG..GGBB..BB]
-// mask_scale[RGB]
-// output_orlder_list [3 * kBlockSize]
-
-__kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/,
-                                         __global coeff_t *block_list/*in*/,
-                                         __global float *orig_image/*in*/,
-                                         __global float *mask_scale/*in*/,
+// batch��ָ�Ѿ���ά��չ��Ϊ��һά��
+__kernel void clComputeBlockZeroingOrder(__global const coeff_t *orig_batch,         // ԭʼͼ��ϵ��
+                                         __global const float   *orig_image_batch,   // ԭʼͼ��pregamma��
+                                         __global const float   *mask_scale,         // ԭʼͼ���ĳ�����ز���
+                                         __global const coeff_t *mayout_batch,       // �����ѡͼ��ϵ��
                                          float BlockErrorLimit,
                                          __global CoeffData *output_order_list/*out*/)
 {
     int block_idx = get_global_id(0);
 #define kComputeBlockSize (kBlockSize * 3)
 
-    __global coeff_t *orig_block     = orig_block_list + block_idx * kComputeBlockSize;
-    __global coeff_t *block          = block_list + block_idx * kComputeBlockSize;
-    __global float* orig_image_block = orig_image + block_idx * kComputeBlockSize;
+    __global coeff_t *orig_block       = orig_batch + block_idx * kComputeBlockSize;
+    __global coeff_t *mayout_block     = mayout_batch + block_idx * kComputeBlockSize;
+    __global float   *orig_image_block = orig_image_batch + block_idx * kComputeBlockSize;
 
     DCTScoreData input_order_data[kComputeBlockSize];
     CoeffData    output_order_data[kComputeBlockSize];
@@ -2574,11 +2813,11 @@ __kernel void clComputeBlockZeroingOrder(__global coeff_t *orig_block_list/*in*/
     IntFloatPairList input_order  = { 0, input_order_data };
     IntFloatPairList output_order = { 0, output_order_data };
 
-    int count = MakeInputOrder(block, orig_block, &input_order, kBlockSize);
+    int count = MakeInputOrder(mayout_block, orig_block, &input_order, kBlockSize);
 
     coeff_t processed_block[kComputeBlockSize];
     for (int i = 0; i < kComputeBlockSize; i++) {
-        processed_block[i] = block[i];
+        processed_block[i] = mayout_block[i];
     }
 
     while (input_order.size > 0)
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index d2f01f3e..5ab406e7 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -1194,9 +1194,14 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	clReleaseMemObject(mem_result);
 }
 
-void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coeff_t *block_batch,
-                              float *orig_image, float* mask_scale, guetzli::CoeffData *output_order_batch,
-                              int size, float BlockErrorLimit)
+// "batch" means the per-block 2-D data has already been flattened into one 1-D array.
+void clComputeBlockZeroingOrder(const guetzli::coeff_t *orig_batch,     // coefficients of the original image
+                                const float *orig_image_batch,          // original image after pregamma
+                                const float* orig_mask_scale_batch,     // parameters derived from the original image; 3 floats per block are uploaded
+                                const guetzli::coeff_t *mayout_batch,   // coefficients of the candidate output image
+                                int size,                               // number of blocks; used as the global work size (one work-item per block)
+                                float BlockErrorLimit,
+                                guetzli::CoeffData *output_order_batch) // receives a copy of the kernel's output buffer
 {
     using namespace guetzli;
 
@@ -1205,20 +1210,20 @@ void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coe
     cl_int err = 0;
     ocl_args_d_t &ocl = getOcl();
 
-    cl_mem mem_orig_block_list = ocl.allocMem(sizeof(coeff_t) * item_count, orig_block_batch);
-    cl_mem mem_block_list = ocl.allocMem(sizeof(coeff_t) * item_count, block_batch);
-    cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * item_count, orig_image);
-    cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * size, mask_scale);
-    cl_mem mem_output_order_list = ocl.allocMem(sizeof(CoeffData) * item_count);
+    cl_mem mem_orig_batch         = ocl.allocMem(sizeof(coeff_t) * item_count, orig_batch);
+    cl_mem mem_orig_image_batch   = ocl.allocMem(sizeof(float) * item_count, orig_image_batch);
+    cl_mem mem_mask_scale_batch   = ocl.allocMem(sizeof(float) * 3 * size, orig_mask_scale_batch);
+    cl_mem mem_mayout_batch       = ocl.allocMem(sizeof(coeff_t) * item_count, mayout_batch);
+    cl_mem mem_output_order_batch = ocl.allocMem(sizeof(CoeffData) * item_count);
     cl_float clBlockErrorLimit = BlockErrorLimit;
 
     cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZERONGORDER];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_block_list);
-    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_block_list);
-    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_orig_image);
-    clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_mask_scale);
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_batch);
+    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_image_batch);
+    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_mask_scale_batch);
+    clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_mayout_batch);
     clSetKernelArg(kernel, 4, sizeof(cl_float), &clBlockErrorLimit);
-    clSetKernelArg(kernel, 5, sizeof(cl_mem), &mem_output_order_list);
+    clSetKernelArg(kernel, 5, sizeof(cl_mem), &mem_output_order_batch);
 
     size_t globalWorkSize[1] = { size };
     err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -1232,17 +1237,17 @@ void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coe
         LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err));
     }
 
-    CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_list, true, CL_MAP_READ, 0, sizeof(CoeffData) * item_count, 0, NULL, NULL, &err);
+    CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_batch, true, CL_MAP_READ, 0, sizeof(CoeffData) * item_count, 0, NULL, NULL, &err);
     err = clFinish(ocl.commandQueue);
     memcpy(output_order_batch, result, sizeof(CoeffData) * item_count);
 
-    clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_list, result, sizeof(CoeffData) * item_count, NULL, NULL);
+    clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, 0, NULL, NULL); // 4th argument is num_events_in_wait_list (must be 0 with a NULL list), not a byte count
     clFinish(ocl.commandQueue);
 
-    clReleaseMemObject(mem_orig_block_list);
-    clReleaseMemObject(mem_block_list);
-    clReleaseMemObject(mem_orig_image);
-    clReleaseMemObject(mem_mask_scale);
-    clReleaseMemObject(mem_output_order_list);
+    clReleaseMemObject(mem_orig_batch);
+    clReleaseMemObject(mem_orig_image_batch);
+    clReleaseMemObject(mem_mask_scale_batch);
+    clReleaseMemObject(mem_mayout_batch);
+    clReleaseMemObject(mem_output_order_batch);
 
 }
\ No newline at end of file
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 457de4a0..459110de 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -15,7 +15,13 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
     size_t step,
     float* result);
 
-void clComputeBlockZeroingOrder(guetzli::coeff_t *orig_block_batch, guetzli::coeff_t *block_batch, float *orig_iamge, float* mask_scale, guetzli::CoeffData *output_order_batch, int size, float BlockErrorLimit);
+void clComputeBlockZeroingOrder(const guetzli::coeff_t *orig_batch,
+    const float *orig_image_batch,
+    const float* orig_mask_scale_batch,
+    const guetzli::coeff_t *mayout_batch,
+    int size,
+    float BlockErrorLimit,
+    guetzli::CoeffData *output_order_batch);
 
 void clMask(const float* r, const float* g, const float* b,
     const float* r2, const float* g2, const float* b2,
diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp
index 2b705d86..deaf4cdb 100644
--- a/clguetzli/clguetzli_comparator.cpp
+++ b/clguetzli/clguetzli_comparator.cpp
@@ -9,12 +9,12 @@
 
 using namespace guetzli;
 
-void CoeffToIDCT(const coeff_t *block, uint8_t *idct)
+void CoeffToIDCT(const coeff_t block[8*8], uint8_t idct[8*8])
 {
 	guetzli::ComputeBlockIDCT(block, idct);
 }
 
-void IDCTToPixel(const uint8_t idct[8 * 8], uint16_t *pixels_)
+void IDCTToPixel8x8(const uint8_t idct[8 * 8], uint16_t pixels_[8*8])
 {
 	const int block_x = 0;
 	const int block_y = 0;
@@ -32,14 +32,85 @@ void IDCTToPixel(const uint8_t idct[8 * 8], uint16_t *pixels_)
 	}
 }
 
+void IDCTToPixel16x16(const uint8_t idct[8*8], uint16_t pixels_[16*16])
+{   // Upsamples one 8x8 IDCT block into a 16x16 buffer of 16-bit samples; the block position is fixed at (0,0) of a 16x16 image.
+    const int block_x = 0;
+    const int block_y = 0;
+    const int width_  = 16;
+    const int height_ = 16;
+
+    // Fill in the 10x10 pixel area in the subsampled image that will be the
+    // basis of the upsampling. This area is enough to hold the 3x3 kernel of
+    // the fancy upsampler around each pixel.
+    static const int kSubsampledEdgeSize = 10;
+    uint16_t subsampled[kSubsampledEdgeSize * kSubsampledEdgeSize];
+    for (int j = 0; j < kSubsampledEdgeSize; ++j) {
+        // The order we fill in the rows is:
+        //   8 rows intersecting the block, row below, row above
+        const int y0 = block_y * 16 + (j < 9 ? j * 2 : -2);
+        for (int i = 0; i < kSubsampledEdgeSize; ++i) {
+            // The order we fill in each row is:
+            //   8 pixels within the block, left edge, right edge
+            const int ix = ((j < 9 ? (j + 1) * kSubsampledEdgeSize : 0) +
+                (i < 9 ? i + 1 : 0));
+            const int x0 = block_x * 16 + (i < 9 ? i * 2 : -2);
+            if (x0 < 0) {  // left of the image: replicate the neighbour to the right
+                subsampled[ix] = subsampled[ix + 1];
+            }
+            else if (y0 < 0) {  // above the image: replicate the row below
+                subsampled[ix] = subsampled[ix + kSubsampledEdgeSize];
+            }
+            else if (x0 >= width_) {  // right of the image: replicate the neighbour to the left
+                subsampled[ix] = subsampled[ix - 1];
+            }
+            else if (y0 >= height_) {  // below the image: replicate the row above
+                subsampled[ix] = subsampled[ix - kSubsampledEdgeSize];
+            }
+            else if (i < 8 && j < 8) {  // inside the block: scale the 8-bit IDCT sample by 16
+                subsampled[ix] = idct[j * 8 + i] << 4;
+            }
+            else {  // NOTE(review): reads pixels_ before this function writes it; with block (0,0) in a 16x16 image this branch looks unreachable -- confirm
+                // Reconstruct the subsampled pixels around the edge of the current
+                // block by computing the inverse of the fancy upsampler.
+                const int y1 = std::max(y0 - 1, 0);
+                const int x1 = std::max(x0 - 1, 0);
+                subsampled[ix] = (pixels_[y0 * width_ + x0] * 9 +
+                    pixels_[y1 * width_ + x1] +
+                    pixels_[y0 * width_ + x1] * -3 +
+                    pixels_[y1 * width_ + x0] * -3) >> 2;
+            }
+        }
+    }
+
+    // Determine area to update.
+    int xmin = std::max(block_x * 16 - 1, 0);
+    int xmax = std::min(block_x * 16 + 16, width_ - 1);
+    int ymin = std::max(block_y * 16 - 1, 0);
+    int ymax = std::min(block_y * 16 + 16, height_ - 1);
+
+    // Apply the fancy upsampler on the subsampled block.
+    for (int y = ymin; y <= ymax; ++y) {
+        const int y0 = ((y & ~1) / 2 - block_y * 8 + 1) * kSubsampledEdgeSize;  // row offset of the nearest subsampled pixel
+        const int dy = ((y & 1) * 2 - 1) * kSubsampledEdgeSize;  // one subsampled row up (even y) or down (odd y)
+        uint16_t* rowptr = &pixels_[y * width_];
+        for (int x = xmin; x <= xmax; ++x) {
+            const int x0 = (x & ~1) / 2 - block_x * 8 + 1;
+            const int dx = (x & 1) * 2 - 1;  // one subsampled column left (even x) or right (odd x)
+            const int ix = x0 + y0;
+            rowptr[x] = (subsampled[ix] * 9 + subsampled[ix + dy] * 3 +
+                subsampled[ix + dx] * 3 + subsampled[ix + dx + dy]) >> 4;  // 9:3:3:1 weights, sum 16
+        }
+    }
+}
+
 // out = [YUVYUV....YUVYUV]
-void PixelToYUV(uint16_t *pixels_, uint8_t *out)
+void PixelToYUV(uint16_t pixels_[8*8], uint8_t out[8*8], int xsize = 8, int ysize = 8)
 {
 	const int stride = 3;
 
-	for (int y = 0; y < 8; ++y) {
-		for (int x = 0; x < 8; ++x) {
-            int px = y * 8 + x;
+	for (int y = 0; y < ysize; ++y) {   // rows are bounded by ysize (was xsize)
+		for (int x = 0; x < xsize; ++x) {   // columns are bounded by xsize (was ysize)
+            int px = y * xsize + x;         // row-major index; the row stride is xsize
 			*out = static_cast<uint8_t>((pixels_[px] + 8 - (x & 1)) >> 4);
             out += stride;
 		}
@@ -47,9 +118,9 @@ void PixelToYUV(uint16_t *pixels_, uint8_t *out)
 }
 
 // pixel = [YUVYUV...YUVYUV] to [RGBRGB...RGBRGB]
-void YUVToRGB(uint8_t* pixelBlock)
+void YUVToRGB(uint8_t pixelBlock[3*8*8], int size = 8 * 8)
 {
-	for (int i = 0; i < 64; i++)
+	for (int i = 0; i < size; i++)
 	{
 		uint8_t *pixel = &pixelBlock[i*3];
 
@@ -62,8 +133,42 @@ void YUVToRGB(uint8_t* pixelBlock)
 	}
 }
 
+void YUVToImage(uint8_t yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize = 8, int ysize = 8, int inside_x = 8, int inside_y = 8)
+{   // Converts xsize*ysize interleaved YUV pixels to RGB in place, maps them through Srgb8ToLinearTable() into r/g/b, then pads outside inside_x/inside_y.
+    YUVToRGB(yuv, xsize * ysize);
+
+    const double* lut = Srgb8ToLinearTable();
+
+    for (int i = 0; i < xsize * ysize; i++)
+    {
+        r[i] = lut[yuv[3 * i]];
+        g[i] = lut[yuv[3 * i + 1]];
+        b[i] = lut[yuv[3 * i + 2]];
+    }
+    for (int y = 0; y < inside_y; y++)    // columns at or beyond inside_x repeat the last inside column
+    {
+        for (int x = inside_x; x < xsize; x++)
+        {
+            int idx = y * xsize + (inside_x - 1);    // row stride is xsize (was a hard-coded 8)
+            r[y * xsize + x] = r[idx];
+            g[y * xsize + x] = g[idx];
+            b[y * xsize + x] = b[idx];
+        }
+    }
+    for (int y = inside_y; y < ysize; y++)    // rows at or beyond inside_y repeat the last inside row
+    {
+        for (int x = 0; x < xsize; x++)
+        {
+            int idx = (inside_y - 1) * xsize + x;    // row stride is xsize (was a hard-coded 8)
+            r[y * xsize + x] = r[idx];
+            g[y * xsize + x] = g[idx];
+            b[y * xsize + x] = b[idx];
+        }
+    }
+}
+
 // block = [R....R][G....G][B.....]
-void BlockToImage(const coeff_t *block, float* r, float* g, float* b, int inside_x, int inside_y)
+void BlockToImage(const coeff_t block[8*8*3], float* r, float* g, float* b, int inside_x, int inside_y)
 {
 	uint8_t idct[3][8 * 8];
 	CoeffToIDCT(&block[0], idct[0]);
@@ -71,13 +176,11 @@ void BlockToImage(const coeff_t *block, float* r, float* g, float* b, int inside
 	CoeffToIDCT(&block[8 * 8 * 2], idct[2]);
 
     uint16_t pixels[3][8 * 8];
-
-	IDCTToPixel(idct[0], pixels[0]);
-	IDCTToPixel(idct[1], pixels[1]);
-	IDCTToPixel(idct[2], pixels[2]);
+	IDCTToPixel8x8(idct[0], pixels[0]);
+	IDCTToPixel8x8(idct[1], pixels[1]);
+	IDCTToPixel8x8(idct[2], pixels[2]);
 
 	uint8_t yuv[8 * 8 * 3];
-
 	PixelToYUV(pixels[0], &yuv[0]);
 	PixelToYUV(pixels[1], &yuv[1]);
 	PixelToYUV(pixels[2], &yuv[2]);
@@ -114,6 +217,87 @@ void BlockToImage(const coeff_t *block, float* r, float* g, float* b, int inside
     }
 }
 
+void CoeffToYUV16x16(const coeff_t block[8 * 8], uint8_t *yuv)
+{   // Runs one 8x8 coefficient block through IDCT and 16x16 upsampling, then writes 16*16 bytes into yuv at a stride of 3.
+    uint8_t idct[8 * 8];
+    CoeffToIDCT(&block[0], &idct[0]);
+
+    uint16_t pixels[16 * 16];
+    IDCTToPixel16x16(idct, pixels);
+
+    PixelToYUV(pixels, yuv, 16, 16);   // callers pass &buffer[c] so the stride-3 writes land in channel c
+}
+
+void CoeffToYUV8x8(const coeff_t block[8 * 8], uint8_t *yuv)
+{   // Runs one 8x8 coefficient block through IDCT and IDCTToPixel8x8, then writes 8*8 bytes into yuv at a stride of 3.
+    uint8_t idct[8 * 8];
+    CoeffToIDCT(&block[0], &idct[0]);
+
+    uint16_t pixels[8 * 8];
+    IDCTToPixel8x8(idct, pixels);
+
+    PixelToYUV(pixels, yuv);   // default xsize = ysize = 8
+}
+
+void Copy8x8To16x16(const uint8_t yuv8x8[3 * 8 * 8], uint8_t yuv16x16[3 * 16 * 16], int off_x, int off_y)
+{   // Copies one channel of an 8x8 tile into the 8x8 quadrant (off_x, off_y) of a 16x16 buffer.
+    for (int y = 0; y < 8; y++)
+    {
+        for (int x = 0; x < 8; x++)
+        {
+            int idx = y * 8 + x;                                  // pixel index in the 8x8 tile
+            int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8);   // matching pixel index in the 16x16 buffer
+            yuv16x16[idx16 * 3] = yuv8x8[idx * 3];                // stride 3: only the first byte of each pixel slot is copied
+        }
+    }
+}
+
+void Copy16x16To8x8(const uint8_t yuv16x16[3 * 16 * 16], uint8_t yuv8x8[3 * 8 * 8], int off_x, int off_y)
+{   // Copies one channel of the 8x8 quadrant (off_x, off_y) of a 16x16 buffer into an 8x8 tile.
+    for (int y = 0; y < 8; y++)
+    {
+        for (int x = 0; x < 8; x++)
+        {
+            int idx = y * 8 + x;                                  // pixel index in the 8x8 tile
+            int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8);   // matching pixel index in the 16x16 buffer
+            yuv8x8[idx * 3] = yuv16x16[idx16 * 3];                // stride 3: only the first byte of each pixel slot is copied
+        }
+    }
+}
+
+void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float g[8 * 8], float b[8 * 8], int off_x, int off_y)
+{   // Extracts the 8x8 quadrant (off_x, off_y) of three 16x16 planes into the separate r, g and b planes.
+    for (int y = 0; y < 8; y++)
+    {
+        for (int x = 0; x < 8; x++)
+        {
+            int idx = y * 8 + x;                                  // pixel index in the 8x8 output planes
+            int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8);   // matching pixel index in the 16x16 planes
+            r[idx] = rgb16x16[0][idx16];
+            g[idx] = rgb16x16[1][idx16];
+            b[idx] = rgb16x16[2][idx16];
+        }
+    }
+}
+
+typedef struct __channel_info_t
+{
+    int factor;         // filled from the component's factor_x() in CompareBlockEx2
+    int block_width;    // filled from the component's width_in_blocks()
+    int block_height;   // filled from the component's height_in_blocks()
+}channel_info;
+
+void ComputeBlockFacor(const coeff_t* candidate_block,
+                       const coeff_t * mayout_coeff[3],
+                       const channel_info mayout_channel[3],
+                       const coeff_t * orig_coeff[3],
+                       const int comp_mask,
+                       int factor
+)
+{   // TODO: empty placeholder -- the body does nothing and none of the parameters are used yet.
+
+}
+
 namespace guetzli
 {
 	ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height,
@@ -188,12 +372,12 @@ namespace guetzli
         ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y);
     }
 
-    double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const
+    double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const
     {
-        double err = CompareBlockEx(img, off_x, off_y, candidate_block);
+        double err = CompareBlockEx2(img, off_x, off_y, candidate_block, comp_mask);
         if (g_checkOpenCL)
         {
-            double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block);
+            double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask);
             if (err1 != err)
             {
                 LogError("CHK %s(%d) \r\n", __FUNCTION__, __LINE__);
@@ -247,38 +431,176 @@ namespace guetzli
         }
 */
         // �����Ǽ��㹤��
+        return ComputeImage8x8Block(rgb0_c, rgb1_c, getCurrentBlock8x8Idx(off_x, off_y));
+	}
+
+    int ButteraugliComparatorEx::GetOrigBlock(std::vector< std::vector<float> > &rgb0_c, int off_x, int off_y) const
+    {   // Copies the three stored planes of the 8x8 block at (off_x, off_y) within the current block into rgb0_c; returns its index, or -1 if it lies outside the image.
+        int block_xx = block_x_ * factor_x_ + off_x;
+        int block_yy = block_y_ * factor_y_ + off_y;
+        if (block_xx * 8 >= width_ || block_yy * 8 >= height_) return -1;   // block starts past the right or bottom edge
+
+        const int block8_width = (width_ + 8 - 1) / 8;   // 8x8 blocks per row, rounded up
+
+        int block_ix = block_yy * block8_width + block_xx;   // row-major block index
+
+        rgb0_c.resize(3);
+        const float*  block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize];   // each block stores 3 consecutive planes of kDCTBlockSize floats
+        for (int i = 0; i < 3; i++)
+        {
+            rgb0_c[i].resize(kDCTBlockSize);
+            memcpy(rgb0_c[i].data(), block_opsin + i * kDCTBlockSize, kDCTBlockSize * sizeof(float));
+        }
+
+        return block_ix;
+    }
+
+    double ButteraugliComparatorEx::CompareBlockEx2(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const
+    {
+        const int block_x = block_x_;
+        const int block_y = block_y_;
+        const int factor = factor_x_;
+
+        const coeff_t *candidate_channel[3];
+        channel_info mayout_channel[3];
+        const coeff_t *mayout_coeff[3];
+        for (int c = 0; c < 3; c++)
+        {
+            candidate_channel[c] = &candidate_block[c * 8 * 8];
+            mayout_coeff[c] = img.component(c).coeffs();
+            mayout_channel[c].block_height = img.component(c).height_in_blocks();
+            mayout_channel[c].block_width  = img.component(c).width_in_blocks();
+            mayout_channel[c].factor       = img.component(c).factor_x();
+        }
+
+        uint8_t yuv16x16[3 * 16 * 16];  // factor 2 mode output image
+        uint8_t yuv8x8[3 * 8 * 8];      // factor 1 mode output image
+
+        // NOTE(review): the original comment here was mis-encoded; it mentions comp_mask and the conversion to RGB. This loop decodes all three components and does not read comp_mask.
+        for (int c = 0; c < 3; c++)
+        {
+            if (mayout_channel[c].factor == 1) {
+                if (factor == 1) {  // channel_factor == factor: this channel takes part in the computation, so use the coefficients in candidate
+                    //int block_8x8idx = block_y * mayout_channel[c].block_width + block_x;
+                    const coeff_t * coeff_block = candidate_channel[c];//mayout_coeff[c] + block_8x8idx * 8 * 8;
+                    CoeffToYUV8x8(coeff_block, &yuv8x8[c]);
+                }
+                else {
+                    for (int iy = 0; iy < factor; ++iy) {
+                        for (int ix = 0; ix < factor; ++ix) {
+                            int block_xx = block_x * factor + ix;
+                            int block_yy = block_y * factor + iy;
+
+                            int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx;
+                            const coeff_t * coeff_block = mayout_coeff[c] + block_8x8idx * 8 * 8;
+                            CoeffToYUV8x8(coeff_block, &yuv8x8[c]);
+
+                            // copy YUV8x8 to YUV1616 corner
+                            Copy8x8To16x16(&yuv8x8[c], &yuv16x16[c], ix, iy);
+                        }
+                    }
+                }
+            }
+            else {
+                if (factor == 1) {
+                    int block_xx = block_x / mayout_channel[c].factor;
+                    int block_yy = block_y / mayout_channel[c].factor;
+                    int ix = block_x % mayout_channel[c].factor;;
+                    int iy = block_y % mayout_channel[c].factor;
+
+                    int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx;
+                    const coeff_t * coeff_block = mayout_coeff[c] + block_8x8idx * 8 * 8;
+                    CoeffToYUV16x16(coeff_block, &yuv16x16[c]);
+
+                    // copy YUV16x16 corner to YUV8x8
+                    Copy16x16To8x8(&yuv16x16[c], &yuv8x8[c], ix, iy);
+                }
+                else {
+                    //int block_8x8idx = block_y * mayout_channel[c].block_width + block_x;
+                    const coeff_t * coeff_block = candidate_channel[c];//mayout_coeff[c] + block_8x8idx * 8 * 8;
+                    CoeffToYUV16x16(coeff_block, &yuv16x16[c]);
+                }
+            }
+        }
+
+        if (factor == 1)
+        {
+            int block_ix = getCurrentBlockIdx();
+            const float*  block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize];
+            std::vector< std::vector<float> > rgb0_c;
+            int block_8x8idx = GetOrigBlock(rgb0_c, 0, 0);
+
+            std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
+            YUVToImage(yuv8x8, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data());
+
+            double err = 0; // NOTE(review): the real score is commented out, so this path always returns 0 -- restore before relying on it: ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx);
+
+            double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask);
+
+            return err;
+        }
+        else
+        {
+            float rgb16x16[3][16 * 16];
+            YUVToImage(yuv16x16, rgb16x16[0], rgb16x16[1], rgb16x16[2], 16, 16, 16, 16); // 16x16 input: yuv8x8 holds only 3*8*8 bytes and would be read out of bounds
+
+            float max_err = 0;
+           // for (int iy = 0; iy < factor; ++iy)
+            {
+                //for (int ix = 0; ix < factor; ++ix)
+                {
+                    int ix = off_x;
+                    int iy = off_y;
+                    std::vector< std::vector<float> > rgb0_c;
+                    int block_8x8idx = GetOrigBlock(rgb0_c, ix, iy);
+                    if (block_8x8idx < 0) return max_err;// continue;
+
+                    std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
+                    Copy16x16ToChannel(rgb16x16, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), ix, iy);
+
+                    float err = ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx); // use the mask scale of the block fetched at (ix, iy), not the one at offset (0, 0)
+                    max_err = std::max(max_err, err);
+                }
+            }
+            return max_err;
+        }
+    }
+
+    double ButteraugliComparatorEx::ComputeImage8x8Block(std::vector<std::vector<float> > &rgb0_c,
+        std::vector<std::vector<float> > &rgb1_c,
+        int block_8x8idx) const
+    {
         ::butteraugli::OpsinDynamicsImage(8, 8, rgb0_c);
-		::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c);
+        ::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c);
 
-		std::vector<std::vector<float> > rgb0 = rgb0_c;
-		std::vector<std::vector<float> > rgb1 = rgb1_c;
+        std::vector<std::vector<float> > rgb0 = rgb0_c;
+        std::vector<std::vector<float> > rgb1 = rgb1_c;
 
-		::butteraugli::MaskHighIntensityChange(8, 8, rgb0_c, rgb1_c, rgb0, rgb1);
+        ::butteraugli::MaskHighIntensityChange(8, 8, rgb0_c, rgb1_c, rgb0, rgb1);
 
-		double b0[3 * kDCTBlockSize];
-		double b1[3 * kDCTBlockSize];
-		for (int c = 0; c < 3; ++c) {
-			for (int ix = 0; ix < kDCTBlockSize; ++ix) {
-				b0[c * kDCTBlockSize + ix] = rgb0[c][ix];
-				b1[c * kDCTBlockSize + ix] = rgb1[c][ix];
-			}
-		}
-		double diff_xyz_dc[3] = { 0.0 };
-		double diff_xyz_ac[3] = { 0.0 };
-		double diff_xyz_edge_dc[3] = { 0.0 };
-		::butteraugli::ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc);
-
-		double diff = 0.0;
-		double diff_edge = 0.0;
-		for (int c = 0; c < 3; ++c) {
-            diff      += diff_xyz_dc[c]      * imgMaskXyzScaleBlockList[block_ix * 3 + c];
-            diff      += diff_xyz_ac[c]      * imgMaskXyzScaleBlockList[block_ix * 3 + c];
-            diff_edge += diff_xyz_edge_dc[c] * imgMaskXyzScaleBlockList[block_ix * 3 + c];
-		}
+        double b0[3 * kDCTBlockSize];
+        double b1[3 * kDCTBlockSize];
+        for (int c = 0; c < 3; ++c) {
+            for (int ix = 0; ix < kDCTBlockSize; ++ix) {
+                b0[c * kDCTBlockSize + ix] = rgb0[c][ix];
+                b1[c * kDCTBlockSize + ix] = rgb1[c][ix];
+            }
+        }
+        double diff_xyz_dc[3] = { 0.0 };
+        double diff_xyz_ac[3] = { 0.0 };
+        double diff_xyz_edge_dc[3] = { 0.0 };
+        ::butteraugli::ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc);
+
+        double diff = 0.0;
+        double diff_edge = 0.0;
+        for (int c = 0; c < 3; ++c) {
+            diff += diff_xyz_dc[c] * imgMaskXyzScaleBlockList[block_8x8idx * 3 + c];
+            diff += diff_xyz_ac[c] * imgMaskXyzScaleBlockList[block_8x8idx * 3 + c];
+            diff_edge += diff_xyz_edge_dc[c] * imgMaskXyzScaleBlockList[block_8x8idx * 3 + c];
+        }
         const double kEdgeWeight = 0.05;
-		return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge);
-	}
-
+        return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge);
+    }
 
     int ButteraugliComparatorEx::getCurrentBlockIdx(void) const
     {
@@ -287,4 +609,13 @@ namespace guetzli
 
         return block_y_ * block_width + block_x_;
     }
+
+    int ButteraugliComparatorEx::getCurrentBlock8x8Idx(int off_x, int off_y) const
+    {   // Returns the row-major index of the 8x8 block at (off_x, off_y) inside the current block; no bounds check is done here.
+        int block_xx = block_x_ * factor_x_ + off_x;
+        int block_yy = block_y_ * factor_y_ + off_y;
+
+        const int block8_width =  (width_ + 8 - 1) / 8;   // 8x8 blocks per row, rounded up
+        return block_yy * block8_width + block_xx;
+    }
 }
diff --git a/clguetzli/clguetzli_comparator.h b/clguetzli/clguetzli_comparator.h
index 7f3a768c..840254a7 100644
--- a/clguetzli/clguetzli_comparator.h
+++ b/clguetzli/clguetzli_comparator.h
@@ -15,10 +15,17 @@ namespace guetzli {
         void FinishBlockComparisons() override;
 		void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) override;
 
-        double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const override;
+        double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override;
 		double CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const;
+        double CompareBlockEx2(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const;
     private:
+        int    GetOrigBlock(std::vector< std::vector<float> > &rgb0_c, int off_x, int off_y) const;
+        double ComputeImage8x8Block(std::vector<std::vector<float> > &rgb0_c,
+                                    std::vector<std::vector<float> > &rgb1_c,
+                                    int block_8x8idx) const;
+
         int getCurrentBlockIdx(void) const;
+        int getCurrentBlock8x8Idx(int off_x, int off_y) const;
 	public:
 		std::vector<float> imgOpsinDynamicsBlockList;   // [RR..RRGG..GGBB..BB]:blockCount
         std::vector<float> imgMaskXyzScaleBlockList;    // [RGBRGB..RGBRGB]:blockCount
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index 2f114e02..7be57d49 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -189,7 +189,7 @@ void* ocl_args_d_t::allocC(size_t s)
 	return outputC;
 }
 
-cl_mem ocl_args_d_t::allocMem(size_t s, void *init)
+cl_mem ocl_args_d_t::allocMem(size_t s, const void *init)
 {
 	cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64;
 	cl_int err = 0;
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 59b4582d..bcc8ef9c 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -95,7 +95,7 @@ struct ocl_args_d_t
 	void* allocB(size_t s);
 	void* allocC(size_t s);
 
-	cl_mem allocMem(size_t s, void *init = NULL);
+	cl_mem allocMem(size_t s, const void *init = NULL);
 	ocl_channels allocMemChannels(size_t s);
     void releaseMemChannels(ocl_channels rgb);
 
diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc
index accb905f..124aea8d 100644
--- a/guetzli/butteraugli_comparator.cc
+++ b/guetzli/butteraugli_comparator.cc
@@ -97,7 +97,9 @@ void ButteraugliComparator::SwitchBlock(int block_x, int block_y,
 }
 
 double ButteraugliComparator::CompareBlock(const OutputImage& img,
-                                           int off_x, int off_y, const coeff_t* candidate_block) const {
+                                           int off_x, int off_y,
+                                           const coeff_t* candidate_block,
+                                           const int comp_mask) const {
   int block_x = block_x_ * factor_x_ + off_x;
   int block_y = block_y_ * factor_y_ + off_y;
   int xmin = 8 * block_x;
diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h
index 572a9689..5418c0d2 100644
--- a/guetzli/butteraugli_comparator.h
+++ b/guetzli/butteraugli_comparator.h
@@ -45,7 +45,7 @@ class ButteraugliComparator : public Comparator {
                    int factor_x, int factor_y) override;
 
   double CompareBlock(const OutputImage& img,
-                      int off_x, int off_y, const coeff_t* candidate_block) const override;
+                      int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override;
 
   double ScoreOutputSize(int size) const override;
 
diff --git a/guetzli/comparator.h b/guetzli/comparator.h
index db76ac77..061f9603 100644
--- a/guetzli/comparator.h
+++ b/guetzli/comparator.h
@@ -51,7 +51,7 @@ class Comparator {
   // the resulting per-block distance. The interpretation of the returned
   // distance depends on the comparator used.
   virtual double CompareBlock(const OutputImage& img,
-                              int off_x, int off_y, const coeff_t* candidate_block) const = 0;
+                              int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const = 0;
 
   // Returns the combined score of the output image in the last Compare() call
   // (or the baseline image, if Compare() was not called yet), based on output
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index e3bd4be4..e4b616e4 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -53,7 +53,9 @@ class Processor {
 
  private:
 
-     void SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const double target_mul, bool stop_early);
+  void SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img,
+                              const uint8_t comp_mask, const double target_mul,
+                              bool stop_early);
 
   void SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
                               const uint8_t comp_mask, const double target_mul,
@@ -352,6 +354,7 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
   const float target_mul_low = 0.95f;
 
   QuantData best = TryQuantMatrix(jpg_in, target_mul_high, best_q, img);
+/*
   for (;;) {
     int q_next[3][kDCTBlockSize];
     if (!qgen.GetNext(q_next)) {
@@ -367,7 +370,7 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
       }
     }
   }
-
+*/
   memcpy(&best_q[0][0], &best.q[0][0], kBlockSize * sizeof(best_q[0][0]));
   GUETZLI_LOG(stats_, "\n%s selected quantization matrix:\n",
               downsample ? "YUV420" : "YUV444");
@@ -439,7 +442,7 @@ void Processor::ComputeBlockZeroingOrder(
           int block_xx = block_x * factor_x + ix;
           int block_yy = block_y * factor_y + iy;
           if (8 * block_xx < img->width() && 8 * block_yy < img->height()) {
-            float err = static_cast<float>(comparator_->CompareBlock(*img, ix, iy, candidate_block));
+            float err = static_cast<float>(comparator_->CompareBlock(*img, ix, iy, candidate_block, comp_mask));
             max_err = std::max(max_err, err);
           }
         }
@@ -552,8 +555,21 @@ size_t EstimateDCSize(const JPEGData& jpg) {
 
 }  // namespace
 
-void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const double target_mul, bool stop_early)
+void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask,
+                                       const double target_mul, bool stop_early)
 {
+    const int ncomp = jpg.components.size();
+    if (ncomp != 3) return;
+
+    std::vector<coeff_t> block_[3];
+    for (int c = 0; c < 3; c++)
+    {
+        int block_height = img->component(c).width_in_blocks();
+        int block_width = img->component(c).height_in_blocks();
+
+        block_[c].resize(block_height * block_width);
+    }
+
     // we only support factor_x == factor_y == 1
     const int width = img->width();
     const int height = img->height();
@@ -564,19 +580,19 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
     const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y);
     const int num_blocks = block_width * block_height;
 
-    comparator_->StartBlockComparisons(); // TOBEREMOVE:��ʼ��һЩ����
-    std::vector<coeff_t> orig_block_batch(num_blocks * kBlockSize);   // [block_r block_g block_b]
-    std::vector<coeff_t> block_batch(num_blocks * kBlockSize);        // [block_r block_g block_b]
+    comparator_->StartBlockComparisons(); // Initialize comparator state; mainly pre-processes the original image
+    std::vector<coeff_t> orig_batch(num_blocks * kBlockSize);   // [block_r block_g block_b]
+    std::vector<coeff_t> mayout_batch(num_blocks * kBlockSize); // [block_r block_g block_b]
 
     // step 1 ��ȡ����block list
     for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
         for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
-            coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize];
-            coeff_t *block = &block_batch[block_ix * kBlockSize];
+            coeff_t *orig_block   = &orig_batch[block_ix * kBlockSize];
+            coeff_t *mayout_block = &mayout_batch[block_ix * kBlockSize];
 
             for (int c = 0; c < 3; ++c)
             {
-                img->component(c).GetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]);
+                img->component(c).GetCoeffBlock(block_x, block_y, &mayout_block[c * kDCTBlockSize]);
 
                 const JPEGComponent& comp = jpg.components[c];
                 int jpg_block_ix = block_y * comp.width_in_blocks + block_x;
@@ -595,15 +611,13 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
     {
         output_order_gpu.resize(num_blocks * kBlockSize);
         output_order = output_order_gpu.data();
-        clComputeBlockZeroingOrder(orig_block_batch.data(),
-                                    block_batch.data(),
-                                    comp->imgOpsinDynamicsBlockList.data(),
-                                    comp->imgMaskXyzScaleBlockList.data(),
-                                    output_order_gpu.data(),
-                                    num_blocks,
-                                    comparator_->BlockErrorLimit());
-
-
+        clComputeBlockZeroingOrder(orig_batch.data(),
+                                   comp->imgOpsinDynamicsBlockList.data(),
+                                   comp->imgMaskXyzScaleBlockList.data(),
+                                   mayout_batch.data(),
+                                   num_blocks,
+                                   comparator_->BlockErrorLimit(),
+                                   output_order_gpu.data());
     }
     if (!g_useOpenCL || g_checkOpenCL)
     {
@@ -611,8 +625,8 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
         output_order = output_order_cpu.data();
         for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
             for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
-                coeff_t *orig_block = &orig_block_batch[block_ix * kBlockSize];
-                coeff_t *block = &block_batch[block_ix * kBlockSize];
+                coeff_t *orig_block = &orig_batch[block_ix * kBlockSize];
+                coeff_t *block      = &mayout_batch[block_ix * kBlockSize];
 
                 std::vector<CoeffData> block_order;
                 ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, &block_order);
@@ -1101,7 +1115,7 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in,
     img.ApplyGlobalQuantization(best_q);
 
     if (!downsample) {
-        SelectFrequencyMasking(jpg, &img, 7, 1.0, false);
+      SelectFrequencyMasking(jpg, &img, 7, 1.0, false);
     } else {
       const float ymul = jpg.components.size() == 1 ? 1.0f : 0.97f;
       SelectFrequencyMasking(jpg, &img, 1, ymul, false);

From 6482c6784c651d651e8e355834348f0a770efc07 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 18 May 2017 20:03:49 +0800
Subject: [PATCH 084/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=AE=BF=E9=97=AE?=
 =?UTF-8?q?=E6=8E=A5=E5=8F=A3=EF=BC=8C=E4=B8=BB=E8=A6=81=E7=94=A8=E4=BA=8E?=
 =?UTF-8?q?=E6=95=B0=E6=8D=AE=E6=A0=A1=E9=AA=8C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 guetzli/output_image.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/guetzli/output_image.h b/guetzli/output_image.h
index 1018eeac..9c9f935a 100644
--- a/guetzli/output_image.h
+++ b/guetzli/output_image.h
@@ -37,6 +37,8 @@ class OutputImageComponent {
   int width_in_blocks() const { return width_in_blocks_; }
   int height_in_blocks() const { return height_in_blocks_; }
   const coeff_t* coeffs() const { return &coeffs_[0]; }
+  const uint16_t* pixels() const { return &pixels_[0]; }
+  size_t pixels_size() const { return pixels_.size(); }
   const int* quant() const { return &quant_[0]; }
   bool IsAllZero() const;
 

From c5a08a1b84e7f471bc6fb656ffb64e645174b32c Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 18 May 2017 20:06:01 +0800
Subject: [PATCH 085/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=A0=A1=E9=AA=8C?=
 =?UTF-8?q?=E5=8E=9F=E5=9B=BE=E6=95=B0=E6=8D=AE=E5=8F=98=E5=8C=96=E7=9A=84?=
 =?UTF-8?q?=E4=BB=A3=E7=A0=81=EF=BC=8Cto=20be=20delete?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 guetzli/processor.cc | 98 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 88 insertions(+), 10 deletions(-)

diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index e4b616e4..ffaf025d 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -59,7 +59,7 @@ class Processor {
 
   void SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
                               const uint8_t comp_mask, const double target_mul,
-                              bool stop_early);
+                              bool stop_early, const OutputImage &img2);
 
   void SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img,
       const uint8_t comp_mask,
@@ -72,7 +72,7 @@ class Processor {
   void ComputeBlockZeroingOrder(
       const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
       const int block_x, const int block_y, const int factor_x,
-      const int factor_y, const uint8_t comp_mask, OutputImage* img,
+      const int factor_y, const uint8_t comp_mask, OutputImage* img, const OutputImage& img2,
       std::vector<CoeffData>* output_order);
 
   void ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
@@ -354,7 +354,7 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
   const float target_mul_low = 0.95f;
 
   QuantData best = TryQuantMatrix(jpg_in, target_mul_high, best_q, img);
-/*
+
   for (;;) {
     int q_next[3][kDCTBlockSize];
     if (!qgen.GetNext(q_next)) {
@@ -370,7 +370,7 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
       }
     }
   }
-*/
+
   memcpy(&best_q[0][0], &best.q[0][0], kBlockSize * sizeof(best_q[0][0]));
   GUETZLI_LOG(stats_, "\n%s selected quantization matrix:\n",
               downsample ? "YUV420" : "YUV444");
@@ -383,7 +383,7 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
 void Processor::ComputeBlockZeroingOrder(
     const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
     const int block_x, const int block_y, const int factor_x,
-    const int factor_y, const uint8_t comp_mask, OutputImage* img,
+    const int factor_y, const uint8_t comp_mask, OutputImage* img, const OutputImage &img2,
     std::vector<CoeffData>* output_order) {
   static const uint8_t oldCsf[kDCTBlockSize] = {
       10, 10, 20, 40, 60, 70, 80, 90,
@@ -420,6 +420,19 @@ void Processor::ComputeBlockZeroingOrder(
   coeff_t processed_block[kBlockSize];
   memcpy(processed_block, block, sizeof(processed_block));
   comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y);
+
+  bool bCheck = false;
+  uint8_t orig_rgb[3][16 * 16] = { 0 };
+  if (bCheck)
+  {
+      for (int c = 0; c < 3; ++c) {
+          if (comp_mask & (1 << c) && factor_x == 2) {
+              if ((block_x + 1) * factor_x * 8 > img->width()) continue;
+              img->component(c).ToPixels((block_x + 1) * factor_x * 8, block_y * factor_y * 8, 16, 16, orig_rgb[c], 1);
+          }
+      }
+  }
+
   while (!input_order.empty()) {
     float best_err = 1e17f;
     int best_i = 0;
@@ -451,6 +464,36 @@ void Processor::ComputeBlockZeroingOrder(
         best_err = max_err;
         best_i = i;
       }
+
+      if (bCheck)
+      {
+          // Restore the original coefficients after every candidate, then verify
+          for (int c = 0; c < 3; ++c) {
+              if (comp_mask & (1 << c)) {
+                  img->component(c).SetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]);
+              }
+          }
+          // Re-read the pixels and check whether they were actually restored
+          uint8_t last_rgb[3][16 * 16] = { 0 };
+          for (int c = 0; c < 3; ++c) {
+              if (comp_mask & (1 << c) && factor_x == 2) {
+                  if ((block_x + 1) * factor_x * 8 > img->width()) continue;
+                  img->component(c).ToPixels((block_x + 1) * factor_x * 8, block_y * factor_y * 8, 16, 16, last_rgb[c], 1);
+              }
+          }
+          int count = 0;
+          for (int c = 0; c < 3; c++) {
+              for (int k = 0; factor_x == 2 && k < 16 * 16; k++) {
+                  if (last_rgb[c][k] != orig_rgb[c][k]) {
+                      count++;
+                  }
+              }
+          }
+          if (count > 0)
+          {
+              LogError("misstake in processing %d:%d block=%d:%d\r\n", count, 16 * 16, block_x, block_y);
+          }
+      }
     }
     int idx = input_order[best_i].first;
     processed_block[idx] = 0;
@@ -483,6 +526,23 @@ void Processor::ComputeBlockZeroingOrder(
           block_x, block_y, &block[c * kDCTBlockSize]);
     }
   }
+
+  if (bCheck)
+  {
+      // Check the whole image once
+      for (int c = 0; c < 3; c++)
+      {
+          int size = img->component(c).pixels_size();
+          if (!(comp_mask & (1 << c))) continue;
+          for (int k = 0; k < size && factor_x == 2; k++)
+          {
+              if (img2.component(c).pixels()[k] != img->component(c).pixels()[k])
+              {
+                  LogError("misstake in restore\r\n");
+              }
+          }
+      }
+  }
 }
 
 namespace {
@@ -770,7 +830,8 @@ void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const
 void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
                                        const uint8_t comp_mask,
                                        const double target_mul,
-                                       bool stop_early) {
+                                       bool stop_early,
+                                       const OutputImage& img2) {
   const int width = img->width();
   const int height = img->height();
   const int ncomp = jpg.components.size();
@@ -809,7 +870,8 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
       }
       block_order.clear();
       ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x,
-                               factor_y, comp_mask, img, &block_order);
+                               factor_y, comp_mask, img, img2, &block_order);
+
       candidate_coeff_offsets[block_ix] = candidate_coeffs.size();
       for (size_t i = 0; i < block_order.size(); ++i) {
         candidate_coeffs.push_back(block_order[i].idx);
@@ -1114,12 +1176,28 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in,
     img.CopyFromJpegData(jpg);
     img.ApplyGlobalQuantization(best_q);
 
+    OutputImage img2(jpg.width, jpg.height);
+    img2.CopyFromJpegData(jpg);
+    img2.ApplyGlobalQuantization(best_q);
+
+    for (int c = 0; c < 3; c++)
+    {
+        int size = img.component(c).pixels_size();
+        for (int k = 0; k < size; k++)
+        {
+            if (img2.component(c).pixels()[k] != img.component(c).pixels()[k])
+            {
+                LogError("fdjsalfjlkadsfdsafjdsfjdlsajdklsjf\r\n");
+            }
+        }
+    }
+
     if (!downsample) {
-      SelectFrequencyMasking(jpg, &img, 7, 1.0, false);
+      SelectFrequencyMasking(jpg, &img, 7, 1.0, false, img2);
     } else {
       const float ymul = jpg.components.size() == 1 ? 1.0f : 0.97f;
-      SelectFrequencyMasking(jpg, &img, 1, ymul, false);
-      SelectFrequencyMasking(jpg, &img, 6, 1.0, true);
+      SelectFrequencyMasking(jpg, &img, 1, ymul, false, img2);
+      SelectFrequencyMasking(jpg, &img, 6, 1.0, true, img2);
     }
   }
 

From d587e6608b409311163878359fb1dde0035652cb Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 18 May 2017 20:08:37 +0800
Subject: [PATCH 086/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0factor=5Fx=20=3D=20fa?=
 =?UTF-8?q?ctor=5Fy=20=3D=202=E6=97=B6=E7=9A=84batch=E5=8C=96=E5=8E=9F?=
 =?UTF-8?q?=E5=9E=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli_comparator.cpp | 175 ++++++++++++++++++-----------
 1 file changed, 108 insertions(+), 67 deletions(-)

diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp
index deaf4cdb..672054d5 100644
--- a/clguetzli/clguetzli_comparator.cpp
+++ b/clguetzli/clguetzli_comparator.cpp
@@ -32,13 +32,8 @@ void IDCTToPixel8x8(const uint8_t idct[8 * 8], uint16_t pixels_[8*8])
 	}
 }
 
-void IDCTToPixel16x16(const uint8_t idct[8*8], uint16_t pixels_[16*16])
+void IDCTToPixel16x16(const uint8_t idct[8*8], uint16_t pixels_out[16*16], const uint16_t *pixel_orig, int block_x, int block_y, int width_, int height_)
 {
-    const int block_x = 0;
-    const int block_y = 0;
-    const int width_  = 16;
-    const int height_ = 16;
-
     // Fill in the 10x10 pixel area in the subsampled image that will be the
     // basis of the upsampling. This area is enough to hold the 3x3 kernel of
     // the fancy upsampler around each pixel.
@@ -74,30 +69,32 @@ void IDCTToPixel16x16(const uint8_t idct[8*8], uint16_t pixels_[16*16])
                 // block by computing the inverse of the fancy upsampler.
                 const int y1 = std::max(y0 - 1, 0);
                 const int x1 = std::max(x0 - 1, 0);
-                subsampled[ix] = (pixels_[y0 * width_ + x0] * 9 +
-                    pixels_[y1 * width_ + x1] +
-                    pixels_[y0 * width_ + x1] * -3 +
-                    pixels_[y1 * width_ + x0] * -3) >> 2;
+                subsampled[ix] = (pixel_orig[y0 * width_ + x0] * 9 +
+                    pixel_orig[y1 * width_ + x1] +
+                    pixel_orig[y0 * width_ + x1] * -3 +
+                    pixel_orig[y1 * width_ + x0] * -3) >> 2;
             }
         }
     }
-
-    // Determine area to update.
-    int xmin = std::max(block_x * 16 - 1, 0);
-    int xmax = std::min(block_x * 16 + 16, width_ - 1);
-    int ymin = std::max(block_y * 16 - 1, 0);
-    int ymax = std::min(block_y * 16 + 16, height_ - 1);
+	// Determine area to update.
+    int xmin = block_x * 16; // std::max(block_x * 16 - 1, 0);
+    int xmax = std::min(block_x * 16 + 15, width_ -  1);
+    int ymin = block_y * 16; // std::max(block_y * 16 - 1, 0);
+    int ymax = std::min(block_y * 16 + 15, height_ - 1);
 
     // Apply the fancy upsampler on the subsampled block.
     for (int y = ymin; y <= ymax; ++y) {
         const int y0 = ((y & ~1) / 2 - block_y * 8 + 1) * kSubsampledEdgeSize;
         const int dy = ((y & 1) * 2 - 1) * kSubsampledEdgeSize;
-        uint16_t* rowptr = &pixels_[y * width_];
         for (int x = xmin; x <= xmax; ++x) {
             const int x0 = (x & ~1) / 2 - block_x * 8 + 1;
             const int dx = (x & 1) * 2 - 1;
             const int ix = x0 + y0;
-            rowptr[x] = (subsampled[ix] * 9 + subsampled[ix + dy] * 3 +
+
+            int out_x = x - xmin;
+            int out_y = y - ymin;
+
+            pixels_out[out_y * 16 + out_x] = (subsampled[ix] * 9 + subsampled[ix + dy] * 3 +
                 subsampled[ix + dx] * 3 + subsampled[ix + dx + dy]) >> 4;
         }
     }
@@ -149,20 +146,20 @@ void YUVToImage(uint8_t yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize
     {
         for (int x = inside_x; x < xsize; x++)
         {
-            int idx = y * 8 + (inside_x - 1);
-            r[y * 8 + x] = r[idx];
-            g[y * 8 + x] = g[idx];
-            b[y * 8 + x] = b[idx];
+            int idx = y * xsize + (inside_x - 1);
+            r[y * xsize + x] = r[idx];
+            g[y * xsize + x] = g[idx];
+            b[y * xsize + x] = b[idx];
         }
     }
     for (int y = inside_y; y < ysize; y++)
     {
         for (int x = 0; x < xsize; x++)
         {
-            int idx = (inside_y - 1) * 8 + x;
-            r[y * 8 + x] = r[idx];
-            g[y * 8 + x] = g[idx];
-            b[y * 8 + x] = b[idx];
+            int idx = (inside_y - 1) * xsize + x;
+            r[y * xsize + x] = r[idx];
+            g[y * xsize + x] = g[idx];
+            b[y * xsize + x] = b[idx];
         }
     }
 }
@@ -217,13 +214,13 @@ void BlockToImage(const coeff_t block[8*8*3], float* r, float* g, float* b, int
     }
 }
 
-void CoeffToYUV16x16(const coeff_t block[8 * 8], uint8_t *yuv)
+void CoeffToYUV16x16(const coeff_t block[8 * 8], uint8_t *yuv, const uint16_t *pixel_orig, int block_x, int block_y, int width_, int height_)
 {
     uint8_t idct[8 * 8];
     CoeffToIDCT(&block[0], &idct[0]);
 
     uint16_t pixels[16 * 16];
-    IDCTToPixel16x16(idct, pixels);
+    IDCTToPixel16x16(idct, pixels, pixel_orig, block_x, block_y, width_, height_);
 
     PixelToYUV(pixels, yuv, 16, 16);
 }
@@ -285,6 +282,7 @@ typedef struct __channel_info_t
     int factor;
     int block_width;
     int block_height;
+    const uint16_t *pixel;
 }channel_info;
 
 void ComputeBlockFacor(const coeff_t* candidate_block,
@@ -403,10 +401,10 @@ namespace guetzli
         }
 
         // img��ȫ���Ż����ͼ������ͨ��coeff_t���ݷ������rgb
-        int border_x = block_x_ * 8 + 8 > width_ ? width_ - block_x_ * 8 : 8;
-        int border_y = block_y_ * 8 + 8 > height_ ? height_ - block_y_ * 8 : 8;
+        int inside_x = block_x_ * 8 + 8 > width_ ? width_ - block_x_ * 8 : 8;
+        int inside_y = block_y_ * 8 + 8 > height_ ? height_ - block_y_ * 8 : 8;
         std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
-        BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), border_x, border_y);
+        BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), inside_x, inside_y);
 /*
         {
             // ���ܻ������⣬������һ��У��
@@ -471,18 +469,18 @@ namespace guetzli
             mayout_channel[c].block_height = img.component(c).height_in_blocks();
             mayout_channel[c].block_width  = img.component(c).width_in_blocks();
             mayout_channel[c].factor       = img.component(c).factor_x();
+            mayout_channel[c].pixel =       img.component(c).pixels();
         }
 
-        uint8_t yuv16x16[3 * 16 * 16];  // factor 2 mode output image
-        uint8_t yuv8x8[3 * 8 * 8];      // factor 1 mode output image
+        uint8_t yuv16x16[3 * 16 * 16] = { 0 };  // factor 2 mode output image
+        uint8_t yuv8x8[3 * 8 * 8] = { 0 };      // factor 1 mode output image
 
         // ����comp_mask��Σ�ת��ΪRGB������Ҫ��
         for (int c = 0; c < 3; c++)
         {
             if (mayout_channel[c].factor == 1) {
                 if (factor == 1) {  // channel_factor == factor ˵��Ҫ�������㣬����candidate�е�ϵ��
-                    //int block_8x8idx = block_y * mayout_channel[c].block_width + block_x;
-                    const coeff_t * coeff_block = candidate_channel[c];//mayout_coeff[c] + block_8x8idx * 8 * 8;
+                    const coeff_t * coeff_block = candidate_channel[c];
                     CoeffToYUV8x8(coeff_block, &yuv8x8[c]);
                 }
                 else {
@@ -491,6 +489,12 @@ namespace guetzli
                             int block_xx = block_x * factor + ix;
                             int block_yy = block_y * factor + iy;
 
+                            if (ix != off_x || iy != off_y) continue;
+                            if (block_xx >= mayout_channel[c].block_width ||
+                                block_yy >= mayout_channel[c].block_height)
+                            {
+                                continue;
+                            }
                             int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx;
                             const coeff_t * coeff_block = mayout_coeff[c] + block_8x8idx * 8 * 8;
                             CoeffToYUV8x8(coeff_block, &yuv8x8[c]);
@@ -508,61 +512,98 @@ namespace guetzli
                     int ix = block_x % mayout_channel[c].factor;;
                     int iy = block_y % mayout_channel[c].factor;
 
-                    int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx;
-                    const coeff_t * coeff_block = mayout_coeff[c] + block_8x8idx * 8 * 8;
-                    CoeffToYUV16x16(coeff_block, &yuv16x16[c]);
+                    int block_16x16idx = block_yy * mayout_channel[c].block_width + block_xx;
+                    const coeff_t * coeff_block = mayout_coeff[c] + block_16x16idx * 8 * 8;
+/*
+                    uint8_t ch[16 * 16] = { 0 };
+                    img.component(c).ToPixels(block_xx * 8, block_yy * 8, 16, 16, ch, 1);
+*/
+                    CoeffToYUV16x16(coeff_block, &yuv16x16[c], mayout_channel[c].pixel, block_xx, block_yy, img.width(), img.height());
 
                     // copy YUV16x16 corner to YUV8x8
                     Copy16x16To8x8(&yuv16x16[c], &yuv8x8[c], ix, iy);
                 }
                 else {
-                    //int block_8x8idx = block_y * mayout_channel[c].block_width + block_x;
-                    const coeff_t * coeff_block = candidate_channel[c];//mayout_coeff[c] + block_8x8idx * 8 * 8;
-                    CoeffToYUV16x16(coeff_block, &yuv16x16[c]);
+                    const coeff_t * coeff_block = candidate_channel[c];
+                    CoeffToYUV16x16(coeff_block, &yuv16x16[c], mayout_channel[c].pixel, block_x, block_y, img.width(), img.height());
                 }
             }
         }
 
         if (factor == 1)
         {
-            int block_ix = getCurrentBlockIdx();
-            const float*  block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize];
             std::vector< std::vector<float> > rgb0_c;
             int block_8x8idx = GetOrigBlock(rgb0_c, 0, 0);
+/*
+            uint8_t yuv[3 * 8 * 8];
 
-            std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
-            YUVToImage(yuv8x8, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data());
-
-            double err = 0;// ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx);
+            std::vector<std::vector<float> > rgb1_c2(3, std::vector<float>(kDCTBlockSize));
+            {
+                int block_x = block_x_ * factor_x_ + off_x;
+                int block_y = block_y_ * factor_y_ + off_y;
+                int xmin = 8 * block_x;
+                int ymin = 8 * block_y;
 
-            double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask);
+                img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2);
 
-            return err;
+                img.component(0).ToPixels(xmin, ymin, 8, 8, &yuv[0], 3);
+                img.component(1).ToPixels(xmin, ymin, 8, 8, &yuv[1], 3);
+                img.component(2).ToPixels(xmin, ymin, 8, 8, &yuv[2], 3);
+            }
+*/
+            int inside_x = block_x_ * 8 + 8 > width_ ? width_ - block_x_ * 8 : 8;
+            int inside_y = block_y_ * 8 + 8 > height_ ? height_ - block_y_ * 8 : 8;
+            std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
+            YUVToImage(yuv8x8, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), 8, 8, inside_x, inside_y);
+/*
+            int count = 0;
+            for (int i = 0; i < 64; i++)
+            {
+                if (rgb1_c[0][i] != rgb1_c2[0][i] ||
+                    rgb1_c[1][i] != rgb1_c2[1][i] ||
+                    rgb1_c[2][i] != rgb1_c2[2][i])
+                {
+                    count++;
+                }
+            }
+            if (count > 0)
+            {
+                LogError("fdjskafjdlasfj");
+            }
+*/
+            return ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx);
         }
         else
         {
-            float rgb16x16[3][16 * 16];
-            YUVToImage(yuv8x8, rgb16x16[0], rgb16x16[1], rgb16x16[2], 16, 16, 16, 16);
-
-            float max_err = 0;
-           // for (int iy = 0; iy < factor; ++iy)
+            int inside_x = block_x_ * 16 + 16 > width_ ? width_ - block_x_ * 16 : 16;
+            int inside_y = block_y_ * 16 + 16 > height_ ? height_ - block_y_ * 16 : 16;
+/*
+            uint8_t yuv[3 * 8 * 8];
+            std::vector<std::vector<float> > rgb1_c2(3, std::vector<float>(kDCTBlockSize));
             {
-                //for (int ix = 0; ix < factor; ++ix)
-                {
-                    int ix = off_x;
-                    int iy = off_y;
-                    std::vector< std::vector<float> > rgb0_c;
-                    int block_8x8idx = GetOrigBlock(rgb0_c, ix, iy);
-                    if (block_8x8idx < 0) return max_err;// continue;
+                int block_x = block_x_ * factor_x_ + off_x;
+                int block_y = block_y_ * factor_y_ + off_y;
+                int xmin = 8 * block_x;
+                int ymin = 8 * block_y;
 
-                    std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
-                    Copy16x16ToChannel(rgb16x16, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), ix, iy);
+                img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2);
 
-                    float err = ComputeImage8x8Block(rgb0_c, rgb1_c, getCurrentBlock8x8Idx(0, 0));
-                    max_err = std::max(max_err, err);
-                }
+                img.component(0).ToPixels(xmin, ymin, 8, 8, &yuv[0], 3);
+                img.component(1).ToPixels(xmin, ymin, 8, 8, &yuv[1], 3);
+                img.component(2).ToPixels(xmin, ymin, 8, 8, &yuv[2], 3);
             }
-            return max_err;
+
+*/
+            float rgb16x16[3][16 * 16];
+            YUVToImage(yuv16x16, rgb16x16[0], rgb16x16[1], rgb16x16[2], 16, 16, inside_x, inside_y);
+
+            std::vector< std::vector<float> > rgb0_c;
+            int block_8x8idx = GetOrigBlock(rgb0_c, off_x, off_y);
+
+            std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
+            Copy16x16ToChannel(rgb16x16, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), off_x, off_y);
+
+            return ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx);
         }
     }
 

From 8d281104038aad564d6b356e1209fa362eb6c970 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 19 May 2017 22:09:46 +0800
Subject: [PATCH 087/189] =?UTF-8?q?=E7=BF=BB=E8=AF=91ComputeBlockEx2?=
 =?UTF-8?q?=E4=B8=BAOpenCL?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl             | 636 ++++++++++++++++++++++++++++-
 clguetzli/clguetzli.cpp            |  69 +++-
 clguetzli/clguetzli.h              |  24 ++
 clguetzli/clguetzli_comparator.cpp |  25 +-
 clguetzli/ocl.h                    |   3 +-
 5 files changed, 714 insertions(+), 43 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index ab595dde..55167a07 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -2020,6 +2020,25 @@ void CoeffToIDCT(__private const coeff_t block[8*8], uchar out[8*8])
 	}
 }
 
+
+void IDCTToPixel8x8(const uchar idct[8 * 8], ushort pixels_[8 * 8])
+{
+    const int block_x = 0;
+    const int block_y = 0;
+    const int width_ = 8;
+    const int height_ = 8;
+
+    for (int iy = 0; iy < 8; ++iy) {
+        for (int ix = 0; ix < 8; ++ix) {
+            int x = 8 * block_x + ix;
+            int y = 8 * block_y + iy;
+            if (x >= width_ || y >= height_) continue;
+            int p = y * width_ + x;
+            pixels_[p] = idct[8 * iy + ix] << 4;
+        }
+    }
+}
+/*
 void IDCTToPixel(const uchar idct[8*8], ushort pixels_[8*8])
 {
 	const int block_x = 0;
@@ -2039,20 +2058,89 @@ void IDCTToPixel(const uchar idct[8*8], ushort pixels_[8*8])
 		}
 	}
 }
+*/
+
+
+void IDCTToPixel16x16(const uchar idct[8 * 8], ushort pixels_out[16 * 16], __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_)
+{
+    // Fill in the 10x10 pixel area in the subsampled image that will be the
+    // basis of the upsampling. This area is enough to hold the 3x3 kernel of
+    // the fancy upsampler around each pixel.
+#define  kSubsampledEdgeSize 10
+    ushort subsampled[kSubsampledEdgeSize * kSubsampledEdgeSize];
+    for (int j = 0; j < kSubsampledEdgeSize; ++j) {
+        // The order we fill in the rows is:
+        //   8 rows intersecting the block, row below, row above
+        const int y0 = block_y * 16 + (j < 9 ? j * 2 : -2);
+        for (int i = 0; i < kSubsampledEdgeSize; ++i) {
+            // The order we fill in each row is:
+            //   8 pixels within the block, left edge, right edge
+            const int ix = ((j < 9 ? (j + 1) * kSubsampledEdgeSize : 0) +
+                (i < 9 ? i + 1 : 0));
+            const int x0 = block_x * 16 + (i < 9 ? i * 2 : -2);
+            if (x0 < 0) {
+                subsampled[ix] = subsampled[ix + 1];
+            }
+            else if (y0 < 0) {
+                subsampled[ix] = subsampled[ix + kSubsampledEdgeSize];
+            }
+            else if (x0 >= width_) {
+                subsampled[ix] = subsampled[ix - 1];
+            }
+            else if (y0 >= height_) {
+                subsampled[ix] = subsampled[ix - kSubsampledEdgeSize];
+            }
+            else if (i < 8 && j < 8) {
+                subsampled[ix] = idct[j * 8 + i] << 4;
+            }
+            else {
+                // Reconstruct the subsampled pixels around the edge of the current
+                // block by computing the inverse of the fancy upsampler.
+                const int y1 = max(y0 - 1, 0);
+                const int x1 = max(x0 - 1, 0);
+                subsampled[ix] = (pixel_orig[y0 * width_ + x0] * 9 +
+                    pixel_orig[y1 * width_ + x1] +
+                    pixel_orig[y0 * width_ + x1] * -3 +
+                    pixel_orig[y1 * width_ + x0] * -3) >> 2;
+            }
+        }
+    }
+    // Determine area to update.
+    int xmin = block_x * 16; // std::max(block_x * 16 - 1, 0);
+    int xmax = min(block_x * 16 + 15, width_ - 1);
+    int ymin = block_y * 16; // std::max(block_y * 16 - 1, 0);
+    int ymax = min(block_y * 16 + 15, height_ - 1);
+
+    // Apply the fancy upsampler on the subsampled block.
+    for (int y = ymin; y <= ymax; ++y) {
+        const int y0 = ((y & ~1) / 2 - block_y * 8 + 1) * kSubsampledEdgeSize;
+        const int dy = ((y & 1) * 2 - 1) * kSubsampledEdgeSize;
+        for (int x = xmin; x <= xmax; ++x) {
+            const int x0 = (x & ~1) / 2 - block_x * 8 + 1;
+            const int dx = (x & 1) * 2 - 1;
+            const int ix = x0 + y0;
+
+            int out_x = x - xmin;
+            int out_y = y - ymin;
+
+            pixels_out[out_y * 16 + out_x] = (subsampled[ix] * 9 + subsampled[ix + dy] * 3 +
+                subsampled[ix + dx] * 3 + subsampled[ix + dx + dy]) >> 4;
+        }
+    }
+}
 
-void PixelToYUV(const ushort pixels_[8*8], uchar out[8*8])
+// out = [YUVYUV....YUVYUV]
+void PixelToYUV(ushort pixels_[8 * 8], uchar out[8 * 8], int xsize/* = 8*/, int ysize/* = 8*/)
 {
-	const int stride = 3;
+    const int stride = 3;
 
-	for (int y = 0; y < 8; ++y)
-	{
-		for (int x = 0; x < 8; ++x)
-		{
-			int px = y * 8 + x;
-			*out = (uchar) ((pixels_[px] + 8 - (x & 1)) >> 4);
-			out += stride;
-		}
-	}
+    for (int y = 0; y < xsize; ++y) {
+        for (int x = 0; x < ysize; ++x) {
+            int px = y * xsize + x;
+            *out = (uchar)((pixels_[px] + 8 - (x & 1)) >> 4);
+            out += stride;
+        }
+    }
 }
 
 __constant static int kCrToRedTable[256] = {
@@ -2242,10 +2330,10 @@ __constant static uchar kRangeLimitLut[4 * 256] = {
 	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 };
 
-void YUVToRGB(__private uchar pixelBlock[3*8*8])
+void YUVToRGB(__private uchar pixelBlock[3*8*8], int size /*= 8 * 8*/)
 {
 	__constant uchar* kRangeLimit = kRangeLimitLut + 384;
-	for (int i = 0; i < 64; i++)
+	for (int i = 0; i < size; i++)
 	{
 		uchar *pixel = &pixelBlock[i * 3];
 
@@ -2517,8 +2605,44 @@ __constant static double kSrgb8ToLinearTable[256] = {
 	255.000000,
 };
 
+
+void YUVToImage(uchar yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize/* = 8*/, int ysize/* = 8*/, int inside_x/* = 8*/, int inside_y/* = 8*/)
+{
+    // Run YUVToRGB over the interleaved buffer, then map every byte through kSrgb8ToLinearTable into r/g/b.
+    const int pixel_count = xsize * ysize;
+    YUVToRGB(yuv, pixel_count);
+    const __constant double* lut = kSrgb8ToLinearTable;
+    for (int p = 0; p < pixel_count; p++)
+    {
+        const int base = 3 * p;
+        r[p] = lut[yuv[base]];
+        g[p] = lut[yuv[base + 1]];
+        b[p] = lut[yuv[base + 2]];
+    }
+    // Columns at or beyond inside_x repeat the last inside column of their row.
+    for (int y = 0; y < inside_y; y++)
+    {
+        const int edge = y * xsize + (inside_x - 1);
+        for (int x = inside_x; x < xsize; x++)
+        {
+            const int dst = y * xsize + x;
+            r[dst] = r[edge]; g[dst] = g[edge]; b[dst] = b[edge];
+        }
+    }
+    // Rows at or beyond inside_y repeat row inside_y - 1, which was already widened by the loop above.
+    for (int y = inside_y; y < ysize; y++)
+    {
+        for (int x = 0; x < xsize; x++)
+        {
+            const int src = (inside_y - 1) * xsize + x, dst = y * xsize + x;
+            r[dst] = r[src]; g[dst] = g[src]; b[dst] = b[src];
+        }
+    }
+}
+
+
 // chrisk todo
-void BlockToImage(__private coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8])
+void BlockToImage(__private coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y)
 {
 	uchar idct[3][8 * 8];
 	CoeffToIDCT(&block[0], &idct[0]);
@@ -2526,16 +2650,16 @@ void BlockToImage(__private coeff_t block[8*8*3], float r[8*8], float g[8*8], fl
 	CoeffToIDCT(&block[8 * 8 * 2], &idct[2]);
 
 	ushort pixels[3][8 * 8];
-	IDCTToPixel(&idct[0], &pixels[0]);
-	IDCTToPixel(&idct[1], &pixels[1]);
-	IDCTToPixel(&idct[2], &pixels[2]);
+	IDCTToPixel8x8(&idct[0], &pixels[0]);
+	IDCTToPixel8x8(&idct[1], &pixels[1]);
+	IDCTToPixel8x8(&idct[2], &pixels[2]);
 
 	uchar yuv[8 * 8 * 3];
-	PixelToYUV(&pixels[0], &yuv[0]);
-	PixelToYUV(&pixels[1], &yuv[1]);
-	PixelToYUV(&pixels[2], &yuv[2]);
+	PixelToYUV(&pixels[0], &yuv[0], 8, 8);
+	PixelToYUV(&pixels[1], &yuv[1], 8, 8);
+	PixelToYUV(&pixels[2], &yuv[2], 8, 8);
 
-	YUVToRGB(yuv);
+	YUVToRGB(yuv, 8 * 8);
 
 	for (int i = 0; i < 8 * 8; i++)
 	{
@@ -2543,6 +2667,110 @@ void BlockToImage(__private coeff_t block[8*8*3], float r[8*8], float g[8*8], fl
 		g[i] = kSrgb8ToLinearTable[yuv[3 * i + 1]];
 		b[i] = kSrgb8ToLinearTable[yuv[3 * i + 2]];
 	}
+    for (int y = 0; y < inside_y; y++)
+    {
+        for (int x = inside_x; x < 8; x++)
+        {
+            int idx = y * 8 + (inside_x - 1);
+            r[y * 8 + x] = r[idx];
+            g[y * 8 + x] = g[idx];
+            b[y * 8 + x] = b[idx];
+        }
+    }
+    for (int y = inside_y; y < 8; y++)
+    {
+        for (int x = 0; x < 8; x++)
+        {
+            int idx = (inside_y - 1) * 8 + x;
+            r[y * 8 + x] = r[idx];
+            g[y * 8 + x] = g[idx];
+            b[y * 8 + x] = b[idx];
+        }
+    }
+}
+
+void CoeffToYUV16x16(const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_)
+{
+    uchar idct[8 * 8];
+    CoeffToIDCT(&block[0], &idct[0]);
+    // Intermediate samples are 16-bit: PixelToYUV takes ushort input and scales it down with >> 4 (same as CoeffToYUV8x8).
+    ushort pixels[16 * 16];  // NOTE(review): assumes IDCTToPixel16x16 writes ushort samples - confirm against its signature
+    IDCTToPixel16x16(idct, pixels, pixel_orig, block_x, block_y, width_, height_);
+    // Emits 16 * 16 samples into yuv, one every third byte (a single interleaved channel).
+    PixelToYUV(pixels, yuv, 16, 16);
+}
+
+void CoeffToYUV16x16_g(__global const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_)
+{
+    // Stage the 64 __global coefficients in a function-local array, then defer to CoeffToYUV16x16.
+    coeff_t staged[8 * 8];
+    for (int k = 8 * 8 - 1; k >= 0; --k) {
+        staged[k] = block[k];
+    }
+    CoeffToYUV16x16(staged, yuv, pixel_orig, block_x, block_y, width_, height_);
+}
+
+void CoeffToYUV8x8(const coeff_t block[8 * 8], uchar *yuv)
+{
+    // Pipeline for one 8x8 channel: CoeffToIDCT -> IDCTToPixel8x8 -> PixelToYUV (writes every third byte of yuv).
+    uchar idct_out[8 * 8];
+    ushort samples[8 * 8];
+
+    CoeffToIDCT(block, idct_out);
+    IDCTToPixel8x8(idct_out, samples);
+    PixelToYUV(samples, yuv, 8, 8);
+}
+
+void CoeffToYUV8x8_g(__global const coeff_t block[8 * 8], uchar *yuv)
+{
+    // Stage the 64 __global coefficients in a function-local array, then defer to CoeffToYUV8x8.
+    coeff_t staged[8 * 8];
+    for (int k = 8 * 8 - 1; k >= 0; --k) {
+        staged[k] = block[k];
+    }
+
+    CoeffToYUV8x8(staged, yuv);
+}
+
+void Copy8x8To16x16(const uchar yuv8x8[3 * 8 * 8], uchar yuv16x16[3 * 16 * 16], int off_x, int off_y)
+{
+    // Copies every third byte only, so the two base pointers select which interleaved channel is moved.
+    const int col0 = off_x * 8;
+    const int row0 = off_y * 8;
+    for (int i = 0; i < 8 * 8; i++)
+    {
+        const int row = i / 8;
+        const int col = i % 8;
+        yuv16x16[((row + row0) * 16 + (col + col0)) * 3] = yuv8x8[i * 3];
+    }
+}
+
+void Copy16x16To8x8(const uchar yuv16x16[3 * 16 * 16], uchar yuv8x8[3 * 8 * 8], int off_x, int off_y)
+{
+    // Copies every third byte only, so the two base pointers select which interleaved channel is moved.
+    const int col0 = off_x * 8;
+    const int row0 = off_y * 8;
+    for (int i = 0; i < 8 * 8; i++)
+    {
+        const int row = i / 8;
+        const int col = i % 8;
+        yuv8x8[i * 3] = yuv16x16[((row + row0) * 16 + (col + col0)) * 3];
+    }
+}
+
+void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float g[8 * 8], float b[8 * 8], int off_x, int off_y)
+{
+    // Extracts the 8x8 quadrant (off_x, off_y) of each 16x16 plane into r, g and b.
+    const int origin = (off_y * 8) * 16 + off_x * 8;
+    for (int row = 0; row < 8; row++)
+    {
+        for (int col = 0; col < 8; col++)
+        {
+            const int src = origin + row * 16 + col;
+            const int dst = row * 8 + col;
+            r[dst] = rgb16x16[0][src]; g[dst] = rgb16x16[1][src]; b[dst] = rgb16x16[2][src];
+        }
+    }
 }
 
 void Convolution(size_t xsize, size_t ysize,
@@ -2714,6 +2942,14 @@ void floatcopy(float *dst, float *src, int size)
     }
 }
 
+void floatcopy_g(float *dst, __global float *src, int size)
+{
+    for (int i = 0; i < size; i++)
+    {
+        dst[i] = src[i];  // element-wise copy of `size` floats out of the __global source; no iterations when size <= 0
+    }
+}
+
 void CalcOpsinDynamicsImage(ocl_channels rgb)
 {
     float rgb_blurred[3][kDCTBlockSize];
@@ -2724,6 +2960,60 @@ void CalcOpsinDynamicsImage(ocl_channels rgb)
     OpsinDynamicsImageBlock(rgb.r, rgb.g, rgb.b, rgb_blurred[0], rgb_blurred[1], rgb_blurred[2], kDCTBlockSize);
 }
 
+void CalcOpsinDynamicsImage2(float rgb[3][kDCTBlockSize])
+{
+    // Blur each plane into scratch storage, then hand the planes and their blurred copies to OpsinDynamicsImageBlock.
+    float blurred[3][kDCTBlockSize];
+    BlurEx(rgb[0], 8, 8, 1.1, 0, blurred[0]);
+    BlurEx(rgb[1], 8, 8, 1.1, 0, blurred[1]);
+    BlurEx(rgb[2], 8, 8, 1.1, 0, blurred[2]);
+    OpsinDynamicsImageBlock(rgb[0], rgb[1], rgb[2], blurred[0], blurred[1], blurred[2], kDCTBlockSize);
+}
+
+double ComputeImage8x8Block(float rgb0_c[3][kDCTBlockSize], float rgb1_c[3][kDCTBlockSize], __global const float* mask_scale_block)
+{
+    CalcOpsinDynamicsImage2(rgb0_c);
+    CalcOpsinDynamicsImage2(rgb1_c);
+    // Snapshot the processed planes: MaskHighIntensityChangeBlock below receives both the copies and the originals.
+    float rgb0[3][kDCTBlockSize];
+    float rgb1[3][kDCTBlockSize];
+    // floatcopy is declared for float*; rgbN[0] decays to float* and the three planes are contiguous.
+    floatcopy(rgb0[0], rgb0_c[0], 3 * kDCTBlockSize);
+    floatcopy(rgb1[0], rgb1_c[0], 3 * kDCTBlockSize);
+
+    MaskHighIntensityChangeBlock(rgb0[0], rgb0[1], rgb0[2],
+                                rgb1[0], rgb1[1], rgb1[2],
+                                rgb0_c[0], rgb0_c[1], rgb0_c[2],
+                                rgb1_c[0], rgb1_c[1], rgb1_c[2],
+                                8, 8);
+
+    // Widen both blocks to double because ButteraugliBlockDiff is called with double arrays (the original note here questioned this conversion).
+    double b0[3 * kDCTBlockSize];       // plane-major: [c * kDCTBlockSize + ix]
+    double b1[3 * kDCTBlockSize];
+    for (int c = 0; c < 3; ++c) {
+        for (int ix = 0; ix < kDCTBlockSize; ++ix) {
+            b0[c * kDCTBlockSize + ix] = rgb0[c][ix];
+            b1[c * kDCTBlockSize + ix] = rgb1[c][ix];
+        }
+    }
+
+    double diff_xyz_dc[3] = { 0.0 };
+    double diff_xyz_ac[3] = { 0.0 };
+    double diff_xyz_edge_dc[3] = { 0.0 };
+    ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc);
+    // Weight each of the three per-channel results by the matching mask_scale_block entry (3 values are read).
+    double diff = 0.0;
+    double diff_edge = 0.0;
+
+    for (int c = 0; c < 3; ++c) {
+        diff += diff_xyz_dc[c] * mask_scale_block[c];
+        diff += diff_xyz_ac[c] * mask_scale_block[c];
+        diff_edge += diff_xyz_edge_dc[c] * mask_scale_block[c];
+    }
+    const double kEdgeWeight = 0.05;
+    return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge);  // both inputs rgb0_c / rgb1_c have been modified in place
+}
+
 // strong todo
 // candidate_block [R....R][G....G][B....B]
 // orig_image_block [RR..RRGG..GGBB..BB]
@@ -2748,7 +3038,7 @@ float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block,
         rgb1_c.r = &image_block[0];
         rgb1_c.g = &image_block[kDCTBlockSize];
         rgb1_c.b = &image_block[2 * kDCTBlockSize];
-        BlockToImage(candidate_block, rgb1_c.r, rgb1_c.g, rgb1_c.b);
+        BlockToImage(candidate_block, rgb1_c.r, rgb1_c.g, rgb1_c.b, 8, 8);
 
         CalcOpsinDynamicsImage(rgb0_c);
         CalcOpsinDynamicsImage(rgb1_c);
@@ -2870,3 +3160,305 @@ __kernel void clComputeBlockZeroingOrder(__global const coeff_t *orig_batch,
         }
     }
 }
+
+typedef struct __channel_info_t       // per-channel description handed to CompareBlockFactor and the Factor kernel
+{
+    int factor;                       // CompareBlockFactor takes the 8x8 path when this is 1, the 16x16 path otherwise
+    int block_width;                  // row pitch, in blocks, used when indexing coeff
+    int block_height;                 // upper bound for block rows in CompareBlockFactor
+    __global const coeff_t *coeff;    // coefficients, 8 * 8 values per block
+    __global const ushort *pixel;     // forwarded to the 16x16 conversion as pixel_orig
+}channel_info;
+
+// return the count of Non-zero item
+int MakeInputOrderEx(coeff_t *block, coeff_t *orig_block, IntFloatPairList *input_order, int block_size)
+{
+    int size = 0;
+    for (int c = 0; c < 3; ++c) {
+        for (int k = 1; k < block_size; ++k) {
+            int idx = c * block_size + k;  // k starts at 1, so element 0 of each channel is never listed
+            if (block[idx] != 0) {         // only coefficients that are still non-zero in `block` are candidates
+                float score = abs(orig_block[idx]) * csf[idx] + bias[idx];  // magnitude comes from orig_block, not block
+                size = list_push_back(input_order, idx, score);  // NOTE(review): presumably returns the new list length - confirm
+            }
+        }
+    }
+    return SortInputOrder(input_order->pData, size);
+}
+
+int GetOrigBlock(float rgb0_c[3][kDCTBlockSize],
+                 __global const float *orig_image_batch,
+                 int width_,
+                 int height_,
+                 int block_x, int block_y,
+                 int factor,
+                 int off_x, int off_y)
+{
+    int block_xx = block_x * factor + off_x;
+    int block_yy = block_y * factor + off_y;
+    if (block_xx * 8 >= width_ || block_yy * 8 >= height_) return -1;  // block starts outside the image; rgb0_c is left untouched
+    // From here the 8x8 block (block_xx, block_yy) is inside the image: copy its three planes into rgb0_c.
+    const int block8_width = (width_ + 8 - 1) / 8;  // blocks per row, image width rounded up to whole blocks
+
+    int block_ix = block_yy * block8_width + block_xx;
+    // Each block occupies 3 * kDCTBlockSize consecutive floats, one plane after another; the buffer is only read.
+    __global const float*  block_opsin = &orig_image_batch[block_ix * 3 * kDCTBlockSize];
+    for (int i = 0; i < 3; i++) {
+        for (int k = 0; k < kDCTBlockSize; k++) {
+            rgb0_c[i][k] = block_opsin[i * kDCTBlockSize + k];
+        }
+    }
+    // The flat block index is returned so the caller can address other per-block data with it.
+    return block_ix;
+}
+
+double CompareBlockFactor(const channel_info mayout_channel[3],
+                          const coeff_t* candidate_block,
+                          const int block_x,
+                          const int block_y,
+                          __global const float *orig_image_batch,
+                          __global const float *mask_scale,
+                          const int image_width,
+                          const int image_height,
+                          const int factor)
+{
+    const coeff_t *candidate_channel[3];
+    for (int c = 0; c < 3; c++) {
+        candidate_channel[c] = &candidate_block[c * 8 * 8];
+    }
+    // Returns the ComputeImage8x8Block error of the candidate; when factor != 1 it is the maximum over the covered 8x8 blocks.
+    uchar yuv16x16[3 * 16 * 16] = { 0 };  // factor 2 mode output image
+    uchar yuv8x8[3 * 8 * 8] = { 0 };      // factor 1 mode output image
+    // Render every channel into the tile that matches the factor under test.
+    for (int c = 0; c < 3; c++)
+    {
+        if (mayout_channel[c].factor == 1) {
+            if (factor == 1) {
+                const coeff_t *coeff_block = candidate_channel[c];
+                CoeffToYUV8x8(coeff_block, &yuv8x8[c]);
+            }
+            else {
+                for (int iy = 0; iy < factor; ++iy) {
+                    for (int ix = 0; ix < factor; ++ix) {
+                        int block_xx = block_x * factor + ix;
+                        int block_yy = block_y * factor + iy;
+
+                        // Skip sub-blocks that fall outside this channel's block grid.
+                        if (block_xx >= mayout_channel[c].block_width ||
+                            block_yy >= mayout_channel[c].block_height)
+                        {
+                            continue;
+                        }
+                        int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx;
+                        __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_8x8idx * 8 * 8;
+                        CoeffToYUV8x8_g(coeff_block, &yuv8x8[c]);
+
+                        // copy YUV8x8 to YUV1616 corner
+                        Copy8x8To16x16(&yuv8x8[c], &yuv16x16[c], ix, iy);
+                    }
+                }
+            }
+        }
+        else {
+            if (factor == 1) {
+                int block_xx = block_x / mayout_channel[c].factor;
+                int block_yy = block_y / mayout_channel[c].factor;
+                int ix = block_x % mayout_channel[c].factor;
+                int iy = block_y % mayout_channel[c].factor;
+
+                int block_16x16idx = block_yy * mayout_channel[c].block_width + block_xx;
+                __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_16x16idx * 8 * 8;
+
+                CoeffToYUV16x16_g(coeff_block, &yuv16x16[c],
+                    mayout_channel[c].pixel, block_xx, block_yy,
+                    image_width,
+                    image_height);
+
+                // copy YUV16x16 corner to YUV8x8
+                Copy16x16To8x8(&yuv16x16[c], &yuv8x8[c], ix, iy);
+            }
+            else {
+                const coeff_t * coeff_block = candidate_channel[c];
+                CoeffToYUV16x16(coeff_block, &yuv16x16[c],
+                    mayout_channel[c].pixel, block_x, block_y,
+                    image_width,
+                    image_height);
+            }
+        }
+    }
+
+    if (factor == 1)
+    {
+        float rgb0_c[3][kDCTBlockSize];
+        int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, 0, 0);
+
+        int inside_x = block_x * 8 + 8 > image_width ? image_width - block_x * 8 : 8;
+        int inside_y = block_y * 8 + 8 > image_height ? image_height - block_y * 8 : 8;
+        float rgb1_c[3][kDCTBlockSize];
+
+        YUVToImage(yuv8x8, rgb1_c[0], rgb1_c[1], rgb1_c[2], 8, 8, inside_x, inside_y);
+
+        return ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3);
+    }
+    else
+    {
+        int inside_x = block_x * 16 + 16 > image_width ? image_width - block_x * 16 : 16;
+        int inside_y = block_y * 16 + 16 > image_height ? image_height - block_y * 16 : 16;
+
+        float rgb16x16[3][16 * 16];
+        YUVToImage(yuv16x16, rgb16x16[0], rgb16x16[1], rgb16x16[2], 16, 16, inside_x, inside_y);
+
+        double max_err = 0;
+        for (int iy = 0; iy < factor; ++iy) {
+            for (int ix = 0; ix < factor; ++ix) {
+                int block_xx = block_x * factor + ix;
+                int block_yy = block_y * factor + iy;
+
+                if (block_xx * 8 >= image_width ||
+                    block_yy * 8 >= image_height)
+                {
+                    continue;
+                }
+
+                float rgb0_c[3][kDCTBlockSize];
+                int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, ix, iy);
+
+                float rgb1_c[3][kDCTBlockSize];
+                Copy16x16ToChannel(rgb16x16, rgb1_c[0], rgb1_c[1], rgb1_c[2], ix, iy);
+                double err = ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3);  // 3 scale values per 8x8 block, same addressing as the factor == 1 path
+                max_err = max(max_err, err);
+            }
+        }
+        return max_err;
+    }
+}
+
+// "batch" buffers are 2-D block grids flattened to 1-D. NOTE(review): the floatcopy_g calls below receive coeff_t pointers although floatcopy_g is declared for float data - confirm the element types.
+__kernel void clComputeBlockZeroingOrderFactor(
+    __global const coeff_t *orig_batch_0,       // original image coefficients, channel 0
+    __global const coeff_t *orig_batch_1,       // original image coefficients, channel 1
+    __global const coeff_t *orig_batch_2,       // original image coefficients, channel 2
+    __global const float   *orig_image_batch,   // original image, pregamma
+    __global const float   *mask_scale,         // mask scale values of the original image
+    int                    image_width,
+    int                    image_height,
+    __global const coeff_t *mayout_batch_0,     // candidate output image coefficients, channel 0
+    __global const coeff_t *mayout_batch_1,     // candidate output image coefficients, channel 1
+    __global const coeff_t *mayout_batch_2,     // candidate output image coefficients, channel 2
+    __global const ushort  *mayout_pixel_0,
+    __global const ushort  *mayout_pixel_1,
+    __global const ushort  *mayout_pixel_2,
+    channel_info            mayout_channel_0,
+    channel_info            mayout_channel_1,
+    channel_info            mayout_channel_2,
+    int factor,                                 // factor currently being processed
+    int comp_mask,                              // channels currently being processed (bit c selects channel c)
+    float BlockErrorLimit,
+    __global CoeffData *output_order_list/*out*/)
+{
+    const int block_x = get_global_id(0);  // one work-item per block: dimension 0 is the block column
+    const int block_y = get_global_id(1);  // dimension 1 is the block row
+#define kComputeBlockSize (kBlockSize * 3)
+
+    channel_info orig_channel[3];  // only .coeff is assigned for the originals
+    orig_channel[0].coeff = orig_batch_0;
+    orig_channel[1].coeff = orig_batch_1;
+    orig_channel[2].coeff = orig_batch_2;
+
+    channel_info mayout_channel[3] = { mayout_channel_0, mayout_channel_1, mayout_channel_2 };
+    mayout_channel[0].coeff = mayout_batch_0;  // pointer members are overwritten with the separate buffer arguments
+    mayout_channel[1].coeff = mayout_batch_1;
+    mayout_channel[2].coeff = mayout_batch_2;
+    mayout_channel[0].pixel = mayout_pixel_0;
+    mayout_channel[1].pixel = mayout_pixel_1;
+    mayout_channel[2].pixel = mayout_pixel_2;
+
+    int block_idx = 0;        // ��������mask���е�channel������indx
+
+    coeff_t mayout_block[kComputeBlockSize] = { 0 };
+    coeff_t orig_block[kComputeBlockSize]   = { 0 };
+    for (int c = 0; c < 3; c++) {
+        if (comp_mask & (1<<c)) {
+            block_idx = block_y * mayout_channel[c].block_width + block_x;
+            floatcopy_g(&mayout_block[c * kBlockSize],
+                        mayout_channel[c].coeff + block_idx * kBlockSize,
+                        kBlockSize);
+
+            floatcopy_g(&orig_block[c * kBlockSize],
+                        orig_channel[c].coeff + block_idx * kBlockSize,
+                        kBlockSize);
+        }
+    }
+
+    DCTScoreData input_order_data[kComputeBlockSize];
+    CoeffData    output_order_data[kComputeBlockSize];
+
+    IntFloatPairList input_order = { 0, input_order_data };
+    IntFloatPairList output_order = { 0, output_order_data };
+
+    int count = MakeInputOrderEx(mayout_block, orig_block, &input_order, kBlockSize);
+
+    coeff_t processed_block[kComputeBlockSize];
+    for (int i = 0; i < kComputeBlockSize; i++) {
+        processed_block[i] = mayout_block[i];
+    }
+
+    while (input_order.size > 0)  // each pass zeroes one coefficient and moves it to output_order
+    {
+        float best_err = 1e17f;
+        int best_i = 0;
+        for (int i = 0; i < min(3, input_order.size); i++)  // only the first (up to) 3 pending entries are tried
+        {
+            coeff_t candidate_block[kComputeBlockSize];
+            for (int i = 0; i < kComputeBlockSize; i++) {  // this i shadows the outer i only inside the copy loop
+                candidate_block[i] = processed_block[i];
+            }
+
+            const int idx = input_order.pData[i].idx;
+
+            candidate_block[idx] = 0;
+
+            float max_err = CompareBlockFactor(mayout_channel,
+                                               candidate_block,
+                                               block_x,
+                                               block_y,
+                                               orig_image_batch,
+                                               mask_scale,
+                                               image_width,
+                                               image_height,
+                                               factor);
+            if (max_err < best_err)
+            {
+                best_err = max_err;
+                best_i = i;
+            }
+        }
+
+        int idx = input_order.pData[best_i].idx;
+        processed_block[idx] = 0;  // commit the trial with the smallest error
+        list_erase(&input_order, best_i);
+
+        list_push_back(&output_order, idx, best_err);
+    }
+
+    // Walk backwards so every entry's err becomes the minimum of itself and all later entries.
+    float min_err = 1e10;
+    for (int i = output_order.size - 1; i >= 0; --i) {
+        min_err = min(min_err, output_order.pData[i].err);
+        output_order.pData[i].err = min_err;
+    }
+
+    __global CoeffData *output_block = output_order_list + block_idx * kComputeBlockSize;  // block_idx comes from the last channel selected by comp_mask
+
+    int out_count = 0;  // number of entries written; not stored anywhere after the loop
+    for (int i = 0; i < kComputeBlockSize && i < output_order.size; i++)
+    {
+        // Drop entries whose error exceeds BlockErrorLimit; only entries within the limit are written out.
+        if (output_order.pData[i].err <= BlockErrorLimit)
+        {
+            output_block[out_count].idx = output_order.pData[i].idx;
+            output_block[out_count].err = output_order.pData[i].err;
+            out_count++;
+        }
+    }
+}
\ No newline at end of file
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 5ab406e7..67a76300 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -63,7 +63,8 @@ ocl_args_d_t& getOcl(void)
 	ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMap", &err);
 	ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMap", &err);
 	ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreq", &err);
-    ocl.kernel[KERNEL_COMPUTEBLOCKZERONGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrder", &err);
+    ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrder", &err);
+    ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrderFactor", &err);
 
 	return ocl;
 }
@@ -1217,7 +1218,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
     cl_mem mem_output_order_batch = ocl.allocMem(sizeof(CoeffData) * item_count);
     cl_float clBlockErrorLimit = BlockErrorLimit;
 
-    cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZERONGORDER];
+    cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER];
     clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_batch);
     clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_image_batch);
     clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_mask_scale_batch);
@@ -1249,5 +1250,69 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
     clReleaseMemObject(mem_mask_scale_batch);
     clReleaseMemObject(mem_mayout_batch);
     clReleaseMemObject(mem_output_order_batch);
+}
+
+void clComputeBlockZeroingOrderFactor(
+    const guetzli::coeff_t *orig_channel[3],
+    const float *orig_image_batch,
+    const float *mask_scale,
+    int image_width,
+    int image_height,
+    const channel_info     *mayout_channel[3],
+    int factor,
+    int comp_mask,
+    int block_width,
+    int block_height,
+    float BlockErrorLimit,
+    guetzli::CoeffData *output_order_batch)
+{
+    return;
+/*
+    using namespace guetzli;
+
+    int item_count = 3 * kDCTBlockSize * size;
+
+    cl_int err = 0;
+    ocl_args_d_t &ocl = getOcl();
+
+    cl_mem mem_orig_batch = ocl.allocMem(sizeof(coeff_t) * item_count, orig_batch);
+    cl_mem mem_orig_image_batch = ocl.allocMem(sizeof(float) * item_count, orig_image_batch);
+    cl_mem mem_mask_scale_batch = ocl.allocMem(sizeof(float) * 3 * size, orig_mask_scale_batch);
+    cl_mem mem_mayout_batch = ocl.allocMem(sizeof(coeff_t) * item_count, mayout_batch);
+    cl_mem mem_output_order_batch = ocl.allocMem(sizeof(CoeffData) * item_count);
+    cl_float clBlockErrorLimit = BlockErrorLimit;
 
+    cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR];
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_batch);
+    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_image_batch);
+    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_mask_scale_batch);
+    clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_mayout_batch);
+    clSetKernelArg(kernel, 4, sizeof(cl_float), &clBlockErrorLimit);
+    clSetKernelArg(kernel, 5, sizeof(cl_mem), &mem_output_order_batch);
+
+    size_t globalWorkSize[1] = { size };
+    err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    if (CL_SUCCESS != err)
+    {
+        LogError("Error: clComputeBlockZeroingOrder() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+    }
+    err = clFinish(ocl.commandQueue);
+    if (CL_SUCCESS != err)
+    {
+        LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err));
+    }
+
+    CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_batch, true, CL_MAP_READ, 0, sizeof(CoeffData) * item_count, 0, NULL, NULL, &err);
+    err = clFinish(ocl.commandQueue);
+    memcpy(output_order_batch, result, sizeof(CoeffData) * item_count);
+
+    clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, sizeof(CoeffData) * item_count, NULL, NULL);
+    clFinish(ocl.commandQueue);
+
+    clReleaseMemObject(mem_orig_batch);
+    clReleaseMemObject(mem_orig_image_batch);
+    clReleaseMemObject(mem_mask_scale_batch);
+    clReleaseMemObject(mem_mayout_batch);
+    clReleaseMemObject(mem_output_order_batch);
+*/  // The disabled draft above uses names that are not parameters here (size, orig_batch, mayout_batch) and sets 6 kernel args; as written the function returns without touching output_order_batch.
 }
\ No newline at end of file
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 459110de..1cd97727 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -23,6 +23,30 @@ void clComputeBlockZeroingOrder(const guetzli::coeff_t *orig_batch,
     float BlockErrorLimit,
     guetzli::CoeffData *output_order_batch);
 
+typedef struct __channel_info_t      // host-side per-channel description; same field order as the struct in clguetzli.cl
+{
+    int factor;                      // filled from img.component(c).factor_x() in clguetzli_comparator.cpp
+    int block_width;                 // filled from width_in_blocks()
+    int block_height;                // filled from height_in_blocks()
+    const guetzli::coeff_t *coeff;   // filled from coeffs()
+    const uint16_t *pixel;           // filled from pixels()
+}channel_info;
+
+void clComputeBlockZeroingOrderFactor(
+    const guetzli::coeff_t *orig_batch[3],
+    const float *orig_image_batch,
+    const float *mask_scale,
+    int image_width,
+    int image_height,
+    // Candidate coefficients travel in channel_info::coeff; the parameter list matches the definition in clguetzli.cpp.
+    const channel_info     *mayout_channel[3],
+    int factor,
+    int comp_mask,
+    int block_width,
+    int block_height,
+    float BlockErrorLimit,
+    guetzli::CoeffData *output_order_batch);
+
 void clMask(const float* r, const float* g, const float* b,
     const float* r2, const float* g2, const float* b2,
     size_t xsize, size_t ysize,
diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp
index 672054d5..1fa6a886 100644
--- a/clguetzli/clguetzli_comparator.cpp
+++ b/clguetzli/clguetzli_comparator.cpp
@@ -276,26 +276,16 @@ void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float
         }
     }
 }
-
+/*
 typedef struct __channel_info_t
 {
     int factor;
     int block_width;
     int block_height;
+    const coeff_t  *coeff;
     const uint16_t *pixel;
 }channel_info;
-
-void ComputeBlockFacor(const coeff_t* candidate_block,
-                       const coeff_t * mayout_coeff[3],
-                       const channel_info mayout_channel[3],
-                       const coeff_t * orig_coeff[3],
-                       const int comp_mask,
-                       int factor
-)
-{
-
-}
-
+*/
 namespace guetzli
 {
 	ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height,
@@ -461,15 +451,14 @@ namespace guetzli
 
         const coeff_t *candidate_channel[3];
         channel_info mayout_channel[3];
-        const coeff_t *mayout_coeff[3];
         for (int c = 0; c < 3; c++)
         {
             candidate_channel[c] = &candidate_block[c * 8 * 8];
-            mayout_coeff[c] = img.component(c).coeffs();
             mayout_channel[c].block_height = img.component(c).height_in_blocks();
             mayout_channel[c].block_width  = img.component(c).width_in_blocks();
             mayout_channel[c].factor       = img.component(c).factor_x();
-            mayout_channel[c].pixel =       img.component(c).pixels();
+            mayout_channel[c].pixel        = img.component(c).pixels();
+            mayout_channel[c].coeff        = img.component(c).coeffs();
         }
 
         uint8_t yuv16x16[3 * 16 * 16] = { 0 };  // factor 2 mode output image
@@ -496,7 +485,7 @@ namespace guetzli
                                 continue;
                             }
                             int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx;
-                            const coeff_t * coeff_block = mayout_coeff[c] + block_8x8idx * 8 * 8;
+                            const coeff_t * coeff_block = mayout_channel[c].coeff + block_8x8idx * 8 * 8;
                             CoeffToYUV8x8(coeff_block, &yuv8x8[c]);
 
                             // copy YUV8x8 to YUV1616 corner
@@ -513,7 +502,7 @@ namespace guetzli
                     int iy = block_y % mayout_channel[c].factor;
 
                     int block_16x16idx = block_yy * mayout_channel[c].block_width + block_xx;
-                    const coeff_t * coeff_block = mayout_coeff[c] + block_16x16idx * 8 * 8;
+                    const coeff_t * coeff_block = mayout_channel[c].coeff + block_16x16idx * 8 * 8;
 /*
                     uint8_t ch[16 * 16] = { 0 };
                     img.component(c).ToPixels(block_xx * 8, block_yy * 8, 16, 16, ch, 1);
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index bcc8ef9c..94bb88b8 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -63,7 +63,8 @@ enum KernelName {
 	KERNEL_EDGEDETECTOR,
 	KERNEL_BLOCKDIFFMAP,
 	KERNEL_EDGEDETECTORLOWFREQ,
-    KERNEL_COMPUTEBLOCKZERONGORDER,
+    KERNEL_COMPUTEBLOCKZEROINGORDER,
+    KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR,
 	KERNEL_COUNT,
 };
 

From 08db770c98a00e964b09fb0cd61ba7940254b791 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 20 May 2017 10:36:27 +0800
Subject: [PATCH 088/189] =?UTF-8?q?clComputeBlockZeroingOrderFactor?=
 =?UTF-8?q?=E8=B0=83=E8=AF=95=20=20=20=20=20Factor=3D=3D1=E6=97=B6?=
 =?UTF-8?q?=E9=AA=8C=E8=AF=81=E9=80=9A=E8=BF=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl  |  26 +++++++---
 clguetzli/clguetzli.cpp | 106 +++++++++++++++++++++++++++-------------
 clguetzli/clguetzli.h   |  17 +++----
 guetzli/processor.cc    |  71 ++++++++++++++++++---------
 4 files changed, 146 insertions(+), 74 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 55167a07..42b9c489 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1464,6 +1464,7 @@ int SortInputOrder(DCTScoreData* input_order, int size)
 	for (j = 1; j < size; j++) {
 		tmp.idx = input_order[j].idx;
 		tmp.err = input_order[j].err;
+
 		i = j - 1;
 		while (i >= 0 && input_order[i].err > tmp.err) {
 			input_order[i + 1].idx = input_order[i].idx;
@@ -2942,7 +2943,7 @@ void floatcopy(float *dst, float *src, int size)
     }
 }
 
-void floatcopy_g(float *dst, __global float *src, int size)
+void coeffcopy_g(coeff_t *dst, __global coeff_t *src, int size)
 {
     for (int i = 0; i < size; i++)
     {
@@ -3171,8 +3172,9 @@ typedef struct __channel_info_t
 }channel_info;
 
 // return the count of Non-zero item
-int MakeInputOrderEx(coeff_t *block, coeff_t *orig_block, IntFloatPairList *input_order, int block_size)
+int MakeInputOrderEx(coeff_t block[3*8*8], coeff_t orig_block[3*8*8], IntFloatPairList *input_order)
 {
+    const int block_size = 64;
     int size = 0;
     for (int c = 0; c < 3; ++c) {
         for (int k = 1; k < block_size; ++k) {
@@ -3183,6 +3185,7 @@ int MakeInputOrderEx(coeff_t *block, coeff_t *orig_block, IntFloatPairList *inpu
             }
         }
     }
+
     return SortInputOrder(input_order->pData, size);
 }
 
@@ -3375,28 +3378,37 @@ __kernel void clComputeBlockZeroingOrderFactor(
 
     int block_idx = 0;        // ��������mask���е�channel������indx
 
-    coeff_t mayout_block[kComputeBlockSize] = { 0 };
-    coeff_t orig_block[kComputeBlockSize]   = { 0 };
+    coeff_t mayout_block[kComputeBlockSize] = { 1,20,160,78 };
+    coeff_t orig_block[kComputeBlockSize]   = { 2,190,78,78 };
+
     for (int c = 0; c < 3; c++) {
         if (comp_mask & (1<<c)) {
             block_idx = block_y * mayout_channel[c].block_width + block_x;
-            floatcopy_g(&mayout_block[c * kBlockSize],
-                        mayout_channel[c].coeff + block_idx * kBlockSize,
+            coeffcopy_g(&mayout_block[c * kBlockSize],
+                mayout_channel[c].coeff + block_idx * kBlockSize,
+                kBlockSize);
+            coeffcopy_g(&orig_block[c * kBlockSize],
+                orig_channel[c].coeff + block_idx * kBlockSize,
+                kBlockSize);
+ /*           floatcopy_g(&mayout_block[c * kBlockSize],
+                       mayout_channel[c].coeff + block_idx * kBlockSize,
                         kBlockSize);
 
             floatcopy_g(&orig_block[c * kBlockSize],
                         orig_channel[c].coeff + block_idx * kBlockSize,
                         kBlockSize);
+*/
         }
     }
 
+
     DCTScoreData input_order_data[kComputeBlockSize];
     CoeffData    output_order_data[kComputeBlockSize];
 
     IntFloatPairList input_order = { 0, input_order_data };
     IntFloatPairList output_order = { 0, output_order_data };
 
-    int count = MakeInputOrderEx(mayout_block, orig_block, &input_order, kBlockSize);
+    int count = MakeInputOrderEx(mayout_block, orig_block, &input_order);
 
     coeff_t processed_block[kComputeBlockSize];
     for (int i = 0; i < kComputeBlockSize; i++) {
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 67a76300..4d132717 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -1253,45 +1253,76 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 }
 
 void clComputeBlockZeroingOrderFactor(
-    const guetzli::coeff_t *orig_channel[3],
+    const channel_info orig_channel[3],
     const float *orig_image_batch,
     const float *mask_scale,
-    int image_width,
-    int image_height,
-    const channel_info     *mayout_channel[3],
-    int factor,
-    int comp_mask,
-    int block_width,
-    int block_height,
-    float BlockErrorLimit,
+    const int image_width,
+    const int image_height,
+    const channel_info mayout_channel[3],
+    const int factor,
+    const int comp_mask,
+    const float BlockErrorLimit,
     guetzli::CoeffData *output_order_batch)
 {
-    return;
-/*
-    using namespace guetzli;
+    const int block8_width = (image_width + 8 - 1) / 8;
+    const int block8_height = (image_height + 8 - 1) / 8;
+    const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor);
+    const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor);
 
-    int item_count = 3 * kDCTBlockSize * size;
+    using namespace guetzli;
 
     cl_int err = 0;
     ocl_args_d_t &ocl = getOcl();
 
-    cl_mem mem_orig_batch = ocl.allocMem(sizeof(coeff_t) * item_count, orig_batch);
-    cl_mem mem_orig_image_batch = ocl.allocMem(sizeof(float) * item_count, orig_image_batch);
-    cl_mem mem_mask_scale_batch = ocl.allocMem(sizeof(float) * 3 * size, orig_mask_scale_batch);
-    cl_mem mem_mayout_batch = ocl.allocMem(sizeof(coeff_t) * item_count, mayout_batch);
-    cl_mem mem_output_order_batch = ocl.allocMem(sizeof(CoeffData) * item_count);
+    cl_mem mem_orig_coeff[3];
+    cl_mem mem_mayout_coeff[3];
+    cl_mem mem_mayout_pixel[3];
+    for (int c = 0; c < 3; c++)
+    {
+        int block_count = orig_channel[c].block_width * orig_channel[c].block_height;
+        mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(coeff_t) * kDCTBlockSize, orig_channel[c].coeff);
+
+        block_count = mayout_channel[c].block_width * mayout_channel[c].block_height;
+        mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(coeff_t) * kDCTBlockSize, mayout_channel[c].coeff);
+
+        mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel);
+
+    }
+    cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch);
+    cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale);
+
+    int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height;
+    cl_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size);
     cl_float clBlockErrorLimit = BlockErrorLimit;
+    cl_int  clWidth = image_width;
+    cl_int  clHeight = image_height;
+    cl_int  clFactor = factor;
+    cl_int clMask = comp_mask;
 
     cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_batch);
-    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_image_batch);
-    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_mask_scale_batch);
-    clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_mayout_batch);
-    clSetKernelArg(kernel, 4, sizeof(cl_float), &clBlockErrorLimit);
-    clSetKernelArg(kernel, 5, sizeof(cl_mem), &mem_output_order_batch);
-
-    size_t globalWorkSize[1] = { size };
-    err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_coeff[0]);
+    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_coeff[1]);
+    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_orig_coeff[2]);
+    clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_orig_image);
+    clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mem_mask_scale);
+    clSetKernelArg(kernel, 5, sizeof(cl_int), &clWidth);
+    clSetKernelArg(kernel, 6, sizeof(cl_int), &clHeight);
+    clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&mem_mayout_coeff[0]);
+    clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&mem_mayout_coeff[1]);
+    clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&mem_mayout_coeff[2]);
+    clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&mem_mayout_pixel[0]);
+    clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&mem_mayout_pixel[1]);
+    clSetKernelArg(kernel, 12, sizeof(cl_mem), (void*)&mem_mayout_pixel[2]);
+    clSetKernelArg(kernel, 13, sizeof(channel_info), &mayout_channel[0]);
+    clSetKernelArg(kernel, 14, sizeof(channel_info), &mayout_channel[1]);
+    clSetKernelArg(kernel, 15, sizeof(channel_info), &mayout_channel[2]);
+    clSetKernelArg(kernel, 16, sizeof(cl_int), &clFactor);
+    clSetKernelArg(kernel, 17, sizeof(cl_int), &clMask);
+    clSetKernelArg(kernel, 18, sizeof(cl_float), &clBlockErrorLimit);
+    clSetKernelArg(kernel, 19, sizeof(cl_mem), &mem_output_order_batch);
+
+    size_t globalWorkSize[2] = {  blockf_width, blockf_height};
+    err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
     if (CL_SUCCESS != err)
     {
         LogError("Error: clComputeBlockZeroingOrder() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
@@ -1302,17 +1333,22 @@ void clComputeBlockZeroingOrderFactor(
         LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err));
     }
 
-    CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_batch, true, CL_MAP_READ, 0, sizeof(CoeffData) * item_count, 0, NULL, NULL, &err);
+    CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_batch, true, CL_MAP_READ, 0, output_order_batch_size, 0, NULL, NULL, &err);
     err = clFinish(ocl.commandQueue);
-    memcpy(output_order_batch, result, sizeof(CoeffData) * item_count);
+    memcpy(output_order_batch, result, output_order_batch_size);
 
-    clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, sizeof(CoeffData) * item_count, NULL, NULL);
+    clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, 0, NULL, NULL);
     clFinish(ocl.commandQueue);
 
-    clReleaseMemObject(mem_orig_batch);
-    clReleaseMemObject(mem_orig_image_batch);
-    clReleaseMemObject(mem_mask_scale_batch);
-    clReleaseMemObject(mem_mayout_batch);
+    for (int c = 0; c < 3; c++)
+    {
+        clReleaseMemObject(mem_orig_coeff[c]);
+        clReleaseMemObject(mem_mayout_coeff[c]);
+        clReleaseMemObject(mem_mayout_pixel[c]);
+
+    }
+
+    clReleaseMemObject(mem_orig_image);
+    clReleaseMemObject(mem_mask_scale);
     clReleaseMemObject(mem_output_order_batch);
-*/
 }
\ No newline at end of file
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 1cd97727..b7479407 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -33,18 +33,15 @@ typedef struct __channel_info_t
 }channel_info;
 
 void clComputeBlockZeroingOrderFactor(
-    const guetzli::coeff_t *orig_batch[3],
+    const channel_info orig_channel[3],
     const float *orig_image_batch,
     const float *mask_scale,
-    int image_width,
-    int image_height,
-    const guetzli::coeff_t *mayout_batch[3],
-    const channel_info     *channel[3],
-    int factor,
-    int comp_mask,
-    int block_width,
-    int block_height,
-    float BlockErrorLimit,
+    const int image_width,
+    const int image_height,
+    const channel_info mayout_channel[3],
+    const int factor,
+    const int comp_mask,
+    const float BlockErrorLimit,
     guetzli::CoeffData *output_order_batch);
 
 void clMask(const float* r, const float* g, const float* b,
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index ffaf025d..885f9cfb 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -618,32 +618,23 @@ size_t EstimateDCSize(const JPEGData& jpg) {
 void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask,
                                        const double target_mul, bool stop_early)
 {
-    const int ncomp = jpg.components.size();
-    if (ncomp != 3) return;
-
-    std::vector<coeff_t> block_[3];
-    for (int c = 0; c < 3; c++)
-    {
-        int block_height = img->component(c).width_in_blocks();
-        int block_width = img->component(c).height_in_blocks();
-
-        block_[c].resize(block_height * block_width);
-    }
-
-    // we only support factor_x == factor_y == 1
     const int width = img->width();
     const int height = img->height();
-    const int factor_x = 1;
-    const int factor_y = 1;
-
+    const int ncomp = jpg.components.size();
+    const int last_c = Log2FloorNonZero(comp_mask);
+    if (static_cast<size_t>(last_c) >= jpg.components.size()) return;
+    const int factor_x = img->component(last_c).factor_x();
+    const int factor_y = img->component(last_c).factor_y();
     const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x);
     const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y);
     const int num_blocks = block_width * block_height;
 
+
     comparator_->StartBlockComparisons(); // ��ʼ��һЩ��������Ҫ�Ƕ�ԭͼ����һЩ����
-    std::vector<coeff_t> orig_batch(num_blocks * kBlockSize);   // [block_r block_g block_b]
-    std::vector<coeff_t> mayout_batch(num_blocks * kBlockSize); // [block_r block_g block_b]
 
+//    std::vector<coeff_t> orig_batch(num_blocks * kBlockSize);   // [block_r block_g block_b]
+//    std::vector<coeff_t> mayout_batch(num_blocks * kBlockSize); // [block_r block_g block_b]
+/*
     // step 1 ��ȡ����block list
     for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
         for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
@@ -660,7 +651,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
             }
         }
     }
-
+*/
     // step 2 ��������block��ϵ��ƫ��
     std::vector<CoeffData> output_order_gpu;
     std::vector<CoeffData> output_order_cpu;
@@ -669,8 +660,40 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
 
     if (g_useOpenCL || g_checkOpenCL)
     {
+        channel_info orig_channel[3];
+        channel_info mayout_channel[3];
+
+        for (int c = 0; c < 3; c++)
+        {
+            mayout_channel[c].factor = img->component(c).factor_x();
+            mayout_channel[c].block_width = img->component(c).width_in_blocks();
+            mayout_channel[c].block_height = img->component(c).height_in_blocks();
+            mayout_channel[c].coeff = img->component(c).coeffs();
+            mayout_channel[c].pixel = img->component(c).pixels();
+
+            orig_channel[c].factor = jpg.components[c].v_samp_factor;
+            orig_channel[c].block_width = jpg.components[c].width_in_blocks;
+            orig_channel[c].block_height = jpg.components[c].height_in_blocks;
+            orig_channel[c].coeff = jpg.components[c].coeffs.data();
+        }
         output_order_gpu.resize(num_blocks * kBlockSize);
         output_order = output_order_gpu.data();
+
+        clComputeBlockZeroingOrderFactor(orig_channel,
+                                        comp->imgOpsinDynamicsBlockList.data(),
+                                        comp->imgMaskXyzScaleBlockList.data(),
+                                        width,
+                                        height,
+                                        mayout_channel,
+                                        factor_x,
+                                        comp_mask,
+                                        comp->BlockErrorLimit(),
+                                        output_order);
+
+/*
+        output_order_gpu.resize(num_blocks * kBlockSize);
+        output_order = output_order_gpu.data();
+
         clComputeBlockZeroingOrder(orig_batch.data(),
                                    comp->imgOpsinDynamicsBlockList.data(),
                                    comp->imgMaskXyzScaleBlockList.data(),
@@ -678,7 +701,10 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
                                    num_blocks,
                                    comparator_->BlockErrorLimit(),
                                    output_order_gpu.data());
+*/
+
     }
+/*
     if (!g_useOpenCL || g_checkOpenCL)
     {
         output_order_cpu.resize(num_blocks * kBlockSize);
@@ -700,6 +726,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
             }
         }
     }
+*/
     if (g_checkOpenCL)
     {
         int count = 0;
@@ -1193,11 +1220,11 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in,
     }
 
     if (!downsample) {
-      SelectFrequencyMasking(jpg, &img, 7, 1.0, false, img2);
+      SelectFrequencyMaskingBatch(jpg, &img, 7, 1.0, false);
     } else {
       const float ymul = jpg.components.size() == 1 ? 1.0f : 0.97f;
-      SelectFrequencyMasking(jpg, &img, 1, ymul, false, img2);
-      SelectFrequencyMasking(jpg, &img, 6, 1.0, true, img2);
+      SelectFrequencyMaskingBatch(jpg, &img, 1, ymul, false);
+      SelectFrequencyMaskingBatch(jpg, &img, 6, 1.0, true);
     }
   }
 

From d931558afce21e32e57c2707fbcdd6f3459fedaa Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 20 May 2017 10:48:21 +0800
Subject: [PATCH 089/189] =?UTF-8?q?=E7=B2=BE=E7=AE=80=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 42b9c489..d2293e17 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -2951,6 +2951,14 @@ void coeffcopy_g(coeff_t *dst, __global coeff_t *src, int size)
     }
 }
 
+void coeffcopy(coeff_t *dst, coeff_t *src, int size)
+{
+    for (int i = 0; i < size; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
 void CalcOpsinDynamicsImage(ocl_channels rgb)
 {
     float rgb_blurred[3][kDCTBlockSize];
@@ -3378,8 +3386,8 @@ __kernel void clComputeBlockZeroingOrderFactor(
 
     int block_idx = 0;        // ��������mask���е�channel������indx
 
-    coeff_t mayout_block[kComputeBlockSize] = { 1,20,160,78 };
-    coeff_t orig_block[kComputeBlockSize]   = { 2,190,78,78 };
+    coeff_t mayout_block[kComputeBlockSize] = { 0 };
+    coeff_t orig_block[kComputeBlockSize]   = { 0 };
 
     for (int c = 0; c < 3; c++) {
         if (comp_mask & (1<<c)) {
@@ -3390,14 +3398,6 @@ __kernel void clComputeBlockZeroingOrderFactor(
             coeffcopy_g(&orig_block[c * kBlockSize],
                 orig_channel[c].coeff + block_idx * kBlockSize,
                 kBlockSize);
- /*           floatcopy_g(&mayout_block[c * kBlockSize],
-                       mayout_channel[c].coeff + block_idx * kBlockSize,
-                        kBlockSize);
-
-            floatcopy_g(&orig_block[c * kBlockSize],
-                        orig_channel[c].coeff + block_idx * kBlockSize,
-                        kBlockSize);
-*/
         }
     }
 
@@ -3411,9 +3411,7 @@ __kernel void clComputeBlockZeroingOrderFactor(
     int count = MakeInputOrderEx(mayout_block, orig_block, &input_order);
 
     coeff_t processed_block[kComputeBlockSize];
-    for (int i = 0; i < kComputeBlockSize; i++) {
-        processed_block[i] = mayout_block[i];
-    }
+    coeffcopy(processed_block, mayout_block, kComputeBlockSize);
 
     while (input_order.size > 0)
     {
@@ -3422,12 +3420,9 @@ __kernel void clComputeBlockZeroingOrderFactor(
         for (int i = 0; i < min(3, input_order.size); i++)
         {
             coeff_t candidate_block[kComputeBlockSize];
-            for (int i = 0; i < kComputeBlockSize; i++) {
-                candidate_block[i] = processed_block[i];
-            }
+            coeffcopy(candidate_block, processed_block, kComputeBlockSize);
 
             const int idx = input_order.pData[i].idx;
-
             candidate_block[idx] = 0;
 
             float max_err = CompareBlockFactor(mayout_channel,

From 0bda30e6dcc7d8cf8c218c2615bad51a2bc39593 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 20 May 2017 14:09:16 +0800
Subject: [PATCH 090/189] =?UTF-8?q?factor=202=E6=94=AF=E6=8C=81=E5=AE=8C?=
 =?UTF-8?q?=E6=88=90=20=E5=8F=91=E7=8E=B0=E4=B8=80=E4=B8=AAbug=EF=BC=8C?=
 =?UTF-8?q?=E5=A6=82=E6=9E=9C=E6=9C=80=E5=90=8E=E4=B8=80=E4=B8=AA=E5=9D=97?=
 =?UTF-8?q?=E8=AE=A1=E7=AE=97=E5=87=BA=E6=9D=A5err=E7=B3=BB=E6=95=B0?=
 =?UTF-8?q?=E4=B8=AA=E6=95=B0=E4=B8=BA0=E7=9A=84=E8=AF=9D=EF=BC=8CBackEnd?=
 =?UTF-8?q?=E5=A4=84=E7=90=86=E4=BC=9Acrash?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl             | 64 ++++++++++++-------------
 clguetzli/clguetzli_comparator.cpp |  3 +-
 guetzli/processor.cc               | 77 +++++++++++-------------------
 3 files changed, 62 insertions(+), 82 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index d2293e17..32d0864d 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1869,7 +1869,7 @@ __constant static float bias[192] = {
 
 // chrisk todo
 // return the count of Non-zero item
-int MakeInputOrder(__global coeff_t *block, __global coeff_t *orig_block, IntFloatPairList *input_order, int block_size)
+int MakeInputOrder(__global const coeff_t *block, __global const coeff_t *orig_block, IntFloatPairList *input_order, int block_size)
 {
 	int size = 0;
 	for (int c = 0; c < 3; ++c) {
@@ -2607,7 +2607,7 @@ __constant static double kSrgb8ToLinearTable[256] = {
 };
 
 
-void YUVToImage(uchar yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize/* = 8*/, int ysize/* = 8*/, int inside_x/* = 8*/, int inside_y/* = 8*/)
+void YUVToImage(__private uchar yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize/* = 8*/, int ysize/* = 8*/, int inside_x/* = 8*/, int inside_y/* = 8*/)
 {
     YUVToRGB(yuv, xsize * ysize);
 
@@ -2646,19 +2646,19 @@ void YUVToImage(uchar yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize/*
 void BlockToImage(__private coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y)
 {
 	uchar idct[3][8 * 8];
-	CoeffToIDCT(&block[0], &idct[0]);
-	CoeffToIDCT(&block[8 * 8], &idct[1]);
-	CoeffToIDCT(&block[8 * 8 * 2], &idct[2]);
+	CoeffToIDCT(&block[0], idct[0]);
+	CoeffToIDCT(&block[8 * 8], idct[1]);
+	CoeffToIDCT(&block[8 * 8 * 2], idct[2]);
 
 	ushort pixels[3][8 * 8];
-	IDCTToPixel8x8(&idct[0], &pixels[0]);
-	IDCTToPixel8x8(&idct[1], &pixels[1]);
-	IDCTToPixel8x8(&idct[2], &pixels[2]);
+	IDCTToPixel8x8(idct[0], pixels[0]);
+	IDCTToPixel8x8(idct[1], pixels[1]);
+	IDCTToPixel8x8(idct[2], pixels[2]);
 
 	uchar yuv[8 * 8 * 3];
-	PixelToYUV(&pixels[0], &yuv[0], 8, 8);
-	PixelToYUV(&pixels[1], &yuv[1], 8, 8);
-	PixelToYUV(&pixels[2], &yuv[2], 8, 8);
+	PixelToYUV(pixels[0], &yuv[0], 8, 8);
+	PixelToYUV(pixels[1], &yuv[1], 8, 8);
+	PixelToYUV(pixels[2], &yuv[2], 8, 8);
 
 	YUVToRGB(yuv, 8 * 8);
 
@@ -2690,12 +2690,12 @@ void BlockToImage(__private coeff_t block[8*8*3], float r[8*8], float g[8*8], fl
     }
 }
 
-void CoeffToYUV16x16(const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_)
+void CoeffToYUV16x16(__private const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_)
 {
     uchar idct[8 * 8];
     CoeffToIDCT(&block[0], &idct[0]);
 
-    uchar pixels[16 * 16];
+    ushort pixels[16 * 16];
     IDCTToPixel16x16(idct, pixels, pixel_orig, block_x, block_y, width_, height_);
 
     PixelToYUV(pixels, yuv, 16, 16);
@@ -2711,7 +2711,7 @@ void CoeffToYUV16x16_g(__global const coeff_t block[8 * 8], uchar *yuv, __global
     CoeffToYUV16x16(b, yuv, pixel_orig, block_x, block_y, width_, height_);
 }
 
-void CoeffToYUV8x8(const coeff_t block[8 * 8], uchar *yuv)
+void CoeffToYUV8x8(__private const coeff_t block[8 * 8], uchar *yuv)
 {
     uchar idct[8 * 8];
     CoeffToIDCT(&block[0], &idct[0]);
@@ -2834,8 +2834,8 @@ void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio,
 
 
 // ian todo
-void OpsinDynamicsImageBlock(float *r, float *g, float *b,
-                            float *r_blurred, float *g_blurred, float *b_blurred,
+void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private float *b,
+                            __private float *r_blurred, __private float *g_blurred, __private float *b_blurred,
                             int size)
 {
   for (size_t i = 0; i < size; ++i) {
@@ -2935,7 +2935,7 @@ typedef union ocl_channels_t
     float *ch[3];
 }ocl_channels;
 
-void floatcopy(float *dst, float *src, int size)
+void floatcopy(float *dst, const float *src, int size)
 {
     for (int i = 0; i < size; i++)
     {
@@ -2943,7 +2943,7 @@ void floatcopy(float *dst, float *src, int size)
     }
 }
 
-void coeffcopy_g(coeff_t *dst, __global coeff_t *src, int size)
+void coeffcopy_g(coeff_t *dst, const __global coeff_t *src, int size)
 {
     for (int i = 0; i < size; i++)
     {
@@ -2951,7 +2951,7 @@ void coeffcopy_g(coeff_t *dst, __global coeff_t *src, int size)
     }
 }
 
-void coeffcopy(coeff_t *dst, coeff_t *src, int size)
+void coeffcopy(coeff_t *dst, const coeff_t *src, int size)
 {
     for (int i = 0; i < size; i++)
     {
@@ -2969,7 +2969,7 @@ void CalcOpsinDynamicsImage(ocl_channels rgb)
     OpsinDynamicsImageBlock(rgb.r, rgb.g, rgb.b, rgb_blurred[0], rgb_blurred[1], rgb_blurred[2], kDCTBlockSize);
 }
 
-void CalcOpsinDynamicsImage2(float rgb[3][kDCTBlockSize])
+void CalcOpsinDynamicsImage2(__private float rgb[3][kDCTBlockSize])
 {
     float rgb_blurred[3][kDCTBlockSize];
     for (int i = 0; i < 3; i++)
@@ -2979,7 +2979,7 @@ void CalcOpsinDynamicsImage2(float rgb[3][kDCTBlockSize])
     OpsinDynamicsImageBlock(rgb[0], rgb[1], rgb[2], rgb_blurred[0], rgb_blurred[1], rgb_blurred[2], kDCTBlockSize);
 }
 
-double ComputeImage8x8Block(float rgb0_c[3][kDCTBlockSize], float rgb1_c[3][kDCTBlockSize], __global float* mask_scale_block)
+double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block)
 {
     CalcOpsinDynamicsImage2(rgb0_c);
     CalcOpsinDynamicsImage2(rgb1_c);
@@ -2987,8 +2987,8 @@ double ComputeImage8x8Block(float rgb0_c[3][kDCTBlockSize], float rgb1_c[3][kDCT
     float rgb0[3][kDCTBlockSize];
     float rgb1[3][kDCTBlockSize];
 
-    floatcopy(rgb0, rgb0_c, 3 * kDCTBlockSize);
-    floatcopy(rgb1, rgb1_c, 3 * kDCTBlockSize);
+    floatcopy(&rgb0[0][0], &rgb0_c[0][0], 3 * kDCTBlockSize);
+    floatcopy(&rgb1[0][0], &rgb1_c[0][0], 3 * kDCTBlockSize);
 
     MaskHighIntensityChangeBlock(rgb0[0], rgb0[1], rgb0[2],
                                 rgb1[0], rgb1[1], rgb1[2],
@@ -3027,7 +3027,7 @@ double ComputeImage8x8Block(float rgb0_c[3][kDCTBlockSize], float rgb1_c[3][kDCT
 // candidate_block [R....R][G....G][B....B]
 // orig_image_block [RR..RRGG..GGBB..BB]
 // mask_scale[RGB]
-float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block, __global float* mask_scale_block)
+float CompareBlockEx(coeff_t *candidate_block, __global const float* orig_image_block, __global const float* mask_scale_block)
 {
     float rgb0[3][kDCTBlockSize];
     float rgb1[3][kDCTBlockSize];
@@ -3052,8 +3052,8 @@ float CompareBlockEx(coeff_t *candidate_block, __global float* orig_image_block,
         CalcOpsinDynamicsImage(rgb0_c);
         CalcOpsinDynamicsImage(rgb1_c);
 
-        floatcopy(rgb0, rgb0_data, 3 * kDCTBlockSize);
-        floatcopy(rgb1, image_block, 3 * kDCTBlockSize);
+        floatcopy(&rgb0[0][0], rgb0_data, 3 * kDCTBlockSize);
+        floatcopy(&rgb1[0][0], image_block, 3 * kDCTBlockSize);
 
         MaskHighIntensityChangeBlock(rgb0[0],rgb0[1], rgb0[2],
                                      rgb1[0], rgb1[1], rgb1[2],
@@ -3102,9 +3102,9 @@ __kernel void clComputeBlockZeroingOrder(__global const coeff_t *orig_batch,
     int block_idx = get_global_id(0);
 #define kComputeBlockSize (kBlockSize * 3)
 
-    __global coeff_t *orig_block       = orig_batch + block_idx * kComputeBlockSize;
-    __global coeff_t *mayout_block     = mayout_batch + block_idx * kComputeBlockSize;
-    __global float   *orig_image_block = orig_image_batch + block_idx * kComputeBlockSize;
+    __global const coeff_t *orig_block       = orig_batch + block_idx * kComputeBlockSize;
+    __global const coeff_t *mayout_block     = mayout_batch + block_idx * kComputeBlockSize;
+    __global const float   *orig_image_block = orig_image_batch + block_idx * kComputeBlockSize;
 
     DCTScoreData input_order_data[kComputeBlockSize];
     CoeffData    output_order_data[kComputeBlockSize];
@@ -3180,7 +3180,7 @@ typedef struct __channel_info_t
 }channel_info;
 
 // return the count of Non-zero item
-int MakeInputOrderEx(coeff_t block[3*8*8], coeff_t orig_block[3*8*8], IntFloatPairList *input_order)
+int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order)
 {
     const int block_size = 64;
     int size = 0;
@@ -3198,7 +3198,7 @@ int MakeInputOrderEx(coeff_t block[3*8*8], coeff_t orig_block[3*8*8], IntFloatPa
 }
 
 int GetOrigBlock(float rgb0_c[3][kDCTBlockSize],
-                 __global float *orig_image_batch,
+                 const __global float *orig_image_batch,
                  int width_,
                  int height_,
                  int block_x, int block_y,
@@ -3336,7 +3336,7 @@ double CompareBlockFactor(const channel_info mayout_channel[3],
 
                 float rgb1_c[3][kDCTBlockSize];
                 Copy16x16ToChannel(rgb16x16, rgb1_c[0], rgb1_c[1], rgb1_c[2], ix, iy);
-                double err = ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx);
+                double err = ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3);
                 max_err = max(max_err, err);
             }
         }
diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp
index 1fa6a886..08890a46 100644
--- a/clguetzli/clguetzli_comparator.cpp
+++ b/clguetzli/clguetzli_comparator.cpp
@@ -362,7 +362,8 @@ namespace guetzli
 
     double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const
     {
-        double err = CompareBlockEx2(img, off_x, off_y, candidate_block, comp_mask);
+        double err = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask);
+        return err;
         if (g_checkOpenCL)
         {
             double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask);
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 885f9cfb..88079303 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -55,7 +55,7 @@ class Processor {
 
   void SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img,
                               const uint8_t comp_mask, const double target_mul,
-                              bool stop_early);
+                              bool stop_early, const OutputImage &img2);
 
   void SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
                               const uint8_t comp_mask, const double target_mul,
@@ -74,9 +74,10 @@ class Processor {
       const int block_x, const int block_y, const int factor_x,
       const int factor_y, const uint8_t comp_mask, OutputImage* img, const OutputImage& img2,
       std::vector<CoeffData>* output_order);
-
+  /*
   void ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
       const int block_x, const int block_y, std::vector<CoeffData>* output_order);
+  */
 
   bool SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
                          int best_q[3][kDCTBlockSize],
@@ -616,7 +617,7 @@ size_t EstimateDCSize(const JPEGData& jpg) {
 }  // namespace
 
 void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask,
-                                       const double target_mul, bool stop_early)
+                                       const double target_mul, bool stop_early, const OutputImage &img2)
 {
     const int width = img->width();
     const int height = img->height();
@@ -632,27 +633,6 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
 
     comparator_->StartBlockComparisons(); // ��ʼ��һЩ��������Ҫ�Ƕ�ԭͼ����һЩ����
 
-//    std::vector<coeff_t> orig_batch(num_blocks * kBlockSize);   // [block_r block_g block_b]
-//    std::vector<coeff_t> mayout_batch(num_blocks * kBlockSize); // [block_r block_g block_b]
-/*
-    // step 1 ��ȡ����block list
-    for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
-        for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
-            coeff_t *orig_block   = &orig_batch[block_ix * kBlockSize];
-            coeff_t *mayout_block = &mayout_batch[block_ix * kBlockSize];
-
-            for (int c = 0; c < 3; ++c)
-            {
-                img->component(c).GetCoeffBlock(block_x, block_y, &mayout_block[c * kDCTBlockSize]);
-
-                const JPEGComponent& comp = jpg.components[c];
-                int jpg_block_ix = block_y * comp.width_in_blocks + block_x;
-                memcpy(&orig_block[c * kDCTBlockSize], &comp.coeffs[jpg_block_ix * kDCTBlockSize], kDCTBlockSize * sizeof(orig_block[0])); // TOBEREMOVE:ȡ��ԭʼͼ��blockϵ��
-            }
-        }
-    }
-*/
-    // step 2 ��������block��ϵ��ƫ��
     std::vector<CoeffData> output_order_gpu;
     std::vector<CoeffData> output_order_cpu;
     CoeffData * output_order = NULL;
@@ -690,32 +670,31 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
                                         comp->BlockErrorLimit(),
                                         output_order);
 
-/*
-        output_order_gpu.resize(num_blocks * kBlockSize);
-        output_order = output_order_gpu.data();
-
-        clComputeBlockZeroingOrder(orig_batch.data(),
-                                   comp->imgOpsinDynamicsBlockList.data(),
-                                   comp->imgMaskXyzScaleBlockList.data(),
-                                   mayout_batch.data(),
-                                   num_blocks,
-                                   comparator_->BlockErrorLimit(),
-                                   output_order_gpu.data());
-*/
-
     }
-/*
     if (!g_useOpenCL || g_checkOpenCL)
     {
         output_order_cpu.resize(num_blocks * kBlockSize);
         output_order = output_order_cpu.data();
         for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
             for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
-                coeff_t *orig_block = &orig_batch[block_ix * kBlockSize];
-                coeff_t *block      = &mayout_batch[block_ix * kBlockSize];
+                coeff_t block[kBlockSize] = { 0 };
+                coeff_t orig_block[kBlockSize] = { 0 };
+                for (int c = 0; c < 3; ++c) {
+                    if (comp_mask & (1 << c)) {
+                        assert(img->component(c).factor_x() == factor_x);
+                        assert(img->component(c).factor_y() == factor_y);
+                        img->component(c).GetCoeffBlock(block_x, block_y,
+                            &block[c * kDCTBlockSize]);
+                        const JPEGComponent& comp = jpg.components[c];
+                        int jpg_block_ix = block_y * comp.width_in_blocks + block_x;
+                        memcpy(&orig_block[c * kDCTBlockSize],
+                            &comp.coeffs[jpg_block_ix * kDCTBlockSize],
+                            kDCTBlockSize * sizeof(orig_block[0]));
+                    }
+                }
 
                 std::vector<CoeffData> block_order;
-                ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, &block_order);
+                ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, factor_y, comp_mask, img, img2, &block_order);
 
                 CoeffData * p = &output_order_cpu[block_ix * kBlockSize];
                 for (int i = 0; i < block_order.size(); i++)
@@ -726,7 +705,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
             }
         }
     }
-*/
+
     if (g_checkOpenCL)
     {
         int count = 0;
@@ -769,11 +748,11 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
     comparator_->FinishBlockComparisons();
     candidate_coeff_offsets[num_blocks] = candidate_coeffs.size();
 
-    SelectFrequencyBackEnd(jpg, img, 7, target_mul, stop_early,
+    SelectFrequencyBackEnd(jpg, img, comp_mask, target_mul, stop_early,
         candidate_coeff_offsets, candidate_coeffs, candidate_coeff_errors);
 
 }
-
+/*
 void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
     const int block_x, const int block_y, std::vector<CoeffData>* output_order)
 {
@@ -824,7 +803,7 @@ void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const
 
             candidate_block[idx] = 0; // TOBEREMOVE:�Ա�block������÷�ǰi�͵���0(i����input_order���ݱ仯���仯)���������ûضԱ�ͼ�������������Ӧblock��ȥ�����������ԱȲ��á�
 
-            float max_err = 0;/// ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(img, 0, 0, candidate_block);
+            float max_err = ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(img, 0, 0, candidate_block);
             if (max_err < best_err) { // TOBEREMOVE:�ҳ���С����ֵ��i
                 best_err = max_err;
                 best_i = i;
@@ -853,7 +832,7 @@ void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const
     }
     output_order->resize(num);
 }
-
+*/
 void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
                                        const uint8_t comp_mask,
                                        const double target_mul,
@@ -1220,11 +1199,11 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in,
     }
 
     if (!downsample) {
-      SelectFrequencyMaskingBatch(jpg, &img, 7, 1.0, false);
+      SelectFrequencyMaskingBatch(jpg, &img, 7, 1.0, false, img2);
     } else {
       const float ymul = jpg.components.size() == 1 ? 1.0f : 0.97f;
-      SelectFrequencyMaskingBatch(jpg, &img, 1, ymul, false);
-      SelectFrequencyMaskingBatch(jpg, &img, 6, 1.0, true);
+      SelectFrequencyMaskingBatch(jpg, &img, 1, ymul, false, img2);
+      SelectFrequencyMaskingBatch(jpg, &img, 6, 1.0, true, img2);
     }
   }
 

From 999585d2eeaabb43ebc18838797e19d183e6022c Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 20 May 2017 17:18:05 +0800
Subject: [PATCH 091/189] =?UTF-8?q?=E5=90=88=E5=B9=B6=E7=B1=BB=E5=9E=8B?=
 =?UTF-8?q?=E5=A3=B0=E6=98=8E=EF=BC=8C=E5=9C=A8opencl=E4=B8=ADinclude=20Op?=
 =?UTF-8?q?enCL=E4=BB=A3=E7=A0=81=E5=90=88=E5=B9=B6=E5=85=A5C=E7=AB=AF?=
 =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BB=A3=E7=A0=81=E4=BE=9B=E7=BC=96=E8=AF=91?=
 =?UTF-8?q?=E8=B0=83=E8=AF=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl             | 139 ++++++++++++-----------------
 clguetzli/clguetzli.cpp            |   8 +-
 clguetzli/clguetzli.h              |  15 +---
 clguetzli/clguetzli_comparator.cpp |  11 +--
 clguetzli/ocl.h                    |  20 +----
 guetzli.vcxproj                    |   3 +-
 guetzli.vcxproj.filters            |   4 +-
 7 files changed, 70 insertions(+), 130 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 32d0864d..7da649e6 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1,10 +1,6 @@
-//#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
-//#elif defined(cl_amd_fp64)
-//#pragma OPENCL EXTENSION cl_amd_fp64 : enable
-//#else
-//#error "Double precision floating point not supported by OpenCL implementation."
-//#endif
+
+#include  "clguetzli\clguetzli.cl.h"
 
 #define kBlockEdge 8
 #define kBlockSize (kBlockEdge * kBlockEdge)
@@ -244,7 +240,7 @@ __kernel void clScaleImage(double scale, __global float *result)
     result[i] *= scale;
 }
 
-kernel void clRemoveBorder(__global float *in, int in_xsize, int s, int s2, __global float *out)
+__kernel void clRemoveBorder(__global float *in, int in_xsize, int s, int s2, __global float *out)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -255,7 +251,7 @@ kernel void clRemoveBorder(__global float *in, int in_xsize, int s, int s2, __gl
     out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2];
 }
 
-kernel void clAddBorder(__global float *out, int s, int s2, __global float *in)
+__kernel void clAddBorder(__global float *out, int s, int s2, __global float *in)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -1427,8 +1423,6 @@ typedef struct __IntFloatPair
     float err;
 }IntFloatPair, DCTScoreData, CoeffData;
 
-typedef short coeff_t;
-
 typedef struct __IntFloatPairList
 {
     int size;
@@ -2643,7 +2637,7 @@ void YUVToImage(__private uchar yuv[3 * 8 * 8], float* r, float* g, float* b, in
 
 
 // chrisk todo
-void BlockToImage(__private coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y)
+void BlockToImage(__private const coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y)
 {
 	uchar idct[3][8 * 8];
 	CoeffToIDCT(&block[0], idct[0]);
@@ -2870,70 +2864,58 @@ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b,
     float *c1_x, float *c1_y, float *c1_b,
     int xsize, int ysize)
 {
-	for (int x = 0; x < xsize; ++x)
-	{
-		for (int y = 0; y < ysize; ++y)
-		{
-			size_t ix = y * xsize + x;
-			const double ave[3] = {
-				(c0_x[ix] + c1_x[ix]) * 0.5,
-				(c0_y[ix] + c1_y[ix]) * 0.5,
-				(c0_b[ix] + c1_b[ix]) * 0.5,
-			};
-			double sqr_max_diff = -1;
-			{
-				int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) };
-				int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize };
-				for (int dir = 0; dir < 4; ++dir) {
-					if (border[dir])
-					{
-						continue;
-					}
-					const int ix2 = ix + offset[dir];
-					double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1];
-					diff *= diff;
-					if (sqr_max_diff < diff)
-					{
-						sqr_max_diff = diff;
-					}
-				}
-			}
-			const double kReductionX = 275.19165240059317;
-			const double kReductionY = 18599.41286306991;
-			const double kReductionZ = 410.8995306951065;
-			const double kChromaBalance = 106.95800948271017;
-			double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance);
-
-			const double mix[3] = {
-				chroma_scale * kReductionX / (sqr_max_diff + kReductionX),
-				kReductionY / (sqr_max_diff + kReductionY),
-				chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ),
-			};
-			// Interpolate lineraly between the average color and the actual
-			// color -- to reduce the importance of this pixel.
-			xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]);
-			xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]);
-
-			xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]);
-			xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]);
-
-			xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]);
-			xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]);
-		}
-	}
-}
-
-typedef union ocl_channels_t
-{
-    struct
+    for (int x = 0; x < xsize; ++x)
     {
-        float * r;
-        float * g;
-        float * b;
-    };
-
-    float *ch[3];
-}ocl_channels;
+        for (int y = 0; y < ysize; ++y)
+        {
+            size_t ix = y * xsize + x;
+            const double ave[3] = {
+                (c0_x[ix] + c1_x[ix]) * 0.5,
+                (c0_y[ix] + c1_y[ix]) * 0.5,
+                (c0_b[ix] + c1_b[ix]) * 0.5,
+            };
+            double sqr_max_diff = -1;
+            {
+                int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) };
+                int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize };
+                for (int dir = 0; dir < 4; ++dir) {
+                    if (border[dir])
+                    {
+                        continue;
+                    }
+                    const int ix2 = ix + offset[dir];
+                    double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1];
+                    diff *= diff;
+                    if (sqr_max_diff < diff)
+                    {
+                        sqr_max_diff = diff;
+                    }
+                }
+            }
+            const double kReductionX = 275.19165240059317;
+            const double kReductionY = 18599.41286306991;
+            const double kReductionZ = 410.8995306951065;
+            const double kChromaBalance = 106.95800948271017;
+            double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance);
+
+            const double mix[3] = {
+                chroma_scale * kReductionX / (sqr_max_diff + kReductionX),
+                kReductionY / (sqr_max_diff + kReductionY),
+                chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ),
+            };
+            // Interpolate lineraly between the average color and the actual
+            // color -- to reduce the importance of this pixel.
+            xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]);
+            xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]);
+
+            xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]);
+            xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]);
+
+            xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]);
+            xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]);
+        }
+    }
+}
 
 void floatcopy(float *dst, const float *src, int size)
 {
@@ -3170,15 +3152,6 @@ __kernel void clComputeBlockZeroingOrder(__global const coeff_t *orig_batch,
     }
 }
 
-typedef struct __channel_info_t
-{
-    int factor;
-    int block_width;
-    int block_height;
-    __global const coeff_t *coeff;
-    __global const ushort *pixel;
-}channel_info;
-
 // return the count of Non-zero item
 int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order)
 {
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 4d132717..ec6e70fb 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -1211,10 +1211,10 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
     cl_int err = 0;
     ocl_args_d_t &ocl = getOcl();
 
-    cl_mem mem_orig_batch         = ocl.allocMem(sizeof(coeff_t) * item_count, orig_batch);
+    cl_mem mem_orig_batch         = ocl.allocMem(sizeof(::coeff_t) * item_count, orig_batch);
     cl_mem mem_orig_image_batch   = ocl.allocMem(sizeof(float) * item_count, orig_image_batch);
     cl_mem mem_mask_scale_batch   = ocl.allocMem(sizeof(float) * 3 * size, orig_mask_scale_batch);
-    cl_mem mem_mayout_batch       = ocl.allocMem(sizeof(coeff_t) * item_count, mayout_batch);
+    cl_mem mem_mayout_batch       = ocl.allocMem(sizeof(::coeff_t) * item_count, mayout_batch);
     cl_mem mem_output_order_batch = ocl.allocMem(sizeof(CoeffData) * item_count);
     cl_float clBlockErrorLimit = BlockErrorLimit;
 
@@ -1280,10 +1280,10 @@ void clComputeBlockZeroingOrderFactor(
     for (int c = 0; c < 3; c++)
     {
         int block_count = orig_channel[c].block_width * orig_channel[c].block_height;
-        mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(coeff_t) * kDCTBlockSize, orig_channel[c].coeff);
+        mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff);
 
         block_count = mayout_channel[c].block_width * mayout_channel[c].block_height;
-        mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(coeff_t) * kDCTBlockSize, mayout_channel[c].coeff);
+        mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff);
 
         mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel);
 
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index b7479407..9be1ac10 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -4,6 +4,8 @@
 #include "guetzli\processor.h"
 #include "ocl.h"
 
+#include "clguetzli.cl.h"
+
 extern bool g_useOpenCL;
 extern bool g_checkOpenCL;
 
@@ -15,23 +17,14 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
     size_t step,
     float* result);
 
-void clComputeBlockZeroingOrder(const guetzli::coeff_t *orig_batch,
+void clComputeBlockZeroingOrder(const coeff_t *orig_batch,
     const float *orig_image_batch,
     const float* orig_mask_scale_batch,
-    const guetzli::coeff_t *mayout_batch,
+    const coeff_t *mayout_batch,
     int size,
     float BlockErrorLimit,
     guetzli::CoeffData *output_order_batch);
 
-typedef struct __channel_info_t
-{
-    int factor;
-    int block_width;
-    int block_height;
-    const guetzli::coeff_t *coeff;
-    const uint16_t *pixel;
-}channel_info;
-
 void clComputeBlockZeroingOrderFactor(
     const channel_info orig_channel[3],
     const float *orig_image_batch,
diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp
index 08890a46..b003ecb8 100644
--- a/clguetzli/clguetzli_comparator.cpp
+++ b/clguetzli/clguetzli_comparator.cpp
@@ -276,16 +276,7 @@ void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float
         }
     }
 }
-/*
-typedef struct __channel_info_t
-{
-    int factor;
-    int block_width;
-    int block_height;
-    const coeff_t  *coeff;
-    const uint16_t *pixel;
-}channel_info;
-*/
+
 namespace guetzli
 {
 	ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height,
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 94bb88b8..0b8df8b4 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -2,6 +2,7 @@
 
 #include "CL\cl.h"
 #include "utils.h"
+#include "clguetzli.cl.h"
 
 // Macros for OpenCL versions
 #define OPENCL_VERSION_1_2  1.2f
@@ -68,25 +69,6 @@ enum KernelName {
 	KERNEL_COUNT,
 };
 
-typedef union ocl_channels_t
-{
-    struct
-    {
-        cl_mem r;
-        cl_mem g;
-        cl_mem b;
-    };
-
-	struct
-	{
-		cl_mem x;
-		cl_mem y;
-		cl_mem b;
-	};
-
-    cl_mem ch[3];
-}ocl_channels;
-
 struct ocl_args_d_t
 {
 	ocl_args_d_t();
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 3ae4554f..c914d909 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -291,8 +291,8 @@
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="clguetzli\clbutter_comparator.cpp" />
+    <ClCompile Include="clguetzli\clguetzli.cl.cpp" />
     <ClCompile Include="clguetzli\clguetzli.cpp" />
-    <ClCompile Include="clguetzli\clguetzli_comparator.cpp" />
     <ClCompile Include="clguetzli\clguetzli_test.cpp" />
     <ClCompile Include="clguetzli\ocl.cpp" />
     <ClCompile Include="clguetzli\utils.cpp" />
@@ -385,6 +385,7 @@
       <FileType>Document</FileType>
       <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
       </Command>
+      <Include Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">"C:\Users\strongtu\Documents\Project\git_strong\guetzli\clguetzli";%(Include)</Include>
     </Intel_OpenCL_Build_Rules>
     <None Include="third_party\libpng\pngwin.def" />
     <None Include="third_party\zlib\inffas32.asm" />
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index 9b0a7ad0..98009b47 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -575,10 +575,10 @@
     <ClCompile Include="clguetzli\clguetzli_test.cpp">
       <Filter>clguetzli</Filter>
     </ClCompile>
-    <ClCompile Include="clguetzli\clbutter_comparator.cpp">
+    <ClCompile Include="clguetzli\clguetzli.cl.cpp">
       <Filter>clguetzli</Filter>
     </ClCompile>
-    <ClCompile Include="clguetzli\clguetzli_comparator.cpp">
+    <ClCompile Include="clguetzli\clbutter_comparator.cpp">
       <Filter>clguetzli</Filter>
     </ClCompile>
   </ItemGroup>

From 643e8dbfcd263d79d6e0085892050abb1d5c09db Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 20 May 2017 17:22:35 +0800
Subject: [PATCH 092/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=20clEnqueueUnmapMemO?=
 =?UTF-8?q?bject=20=E5=8F=82=E6=95=B0=E4=BC=A0=E9=80=92bug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cpp      | 10 +++---
 clguetzli/clguetzli_test.cpp | 60 ++++++++++++++++++------------------
 2 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index ec6e70fb..b5d7c4a2 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -385,9 +385,9 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float*
 	memcpy(g, result_g, channel_size);
 	memcpy(b, result_b, channel_size);
 
-	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, result_r, channel_size, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, result_g, channel_size, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, result_b, channel_size, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, result_r, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, result_g, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, result_b, 0, NULL, NULL);
 	clFinish(ocl.commandQueue);
 
     ocl.releaseMemChannels(rgb);
@@ -1182,7 +1182,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	err = clFinish(ocl.commandQueue);
 	memcpy(result, result_r, channel_size);
 
-	clEnqueueUnmapMemObject(ocl.commandQueue, mem_result, result_r, channel_size, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, mem_result, result_r, 0, NULL, NULL);
 	clFinish(ocl.commandQueue);
 
 	ocl.releaseMemChannels(xyb1);
@@ -1242,7 +1242,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
     err = clFinish(ocl.commandQueue);
     memcpy(output_order_batch, result, sizeof(CoeffData) * item_count);
 
-    clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, sizeof(CoeffData) * item_count, NULL, NULL);
+    clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, 0, NULL, NULL);
     clFinish(ocl.commandQueue);
 
     clReleaseMemObject(mem_orig_batch);
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 38e3e966..6dca483f 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -62,12 +62,12 @@ void tclMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	FLOAT_COMPARE(result_g2, r1_g, xsize * ysize);
 	FLOAT_COMPARE(result_b2, r1_b, xsize * ysize);
 
-	clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.r, r0_r, channel_size, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.g, r0_g, channel_size, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.b, r0_b, channel_size, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.r, r1_r, channel_size, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.g, r1_g, channel_size, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.b, r1_b, channel_size, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.r, r0_r, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.g, r0_g, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, xyb0.b, r0_b, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.r, r1_r, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.g, r1_g, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, xyb1.b, r1_b, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
 	ocl.releaseMemChannels(xyb0);
@@ -106,7 +106,7 @@ void tclEdgeDetectorMap(const float* r, const float* g, const float* b,
 
 	FLOAT_COMPARE(result, r_r, res_xsize * res_ysize * 3);
 
-	clEnqueueUnmapMemObject(ocl.commandQueue, edge, r_r, edgemap_size, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, edge, r_r, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
 	ocl.releaseMemChannels(xyb0);
@@ -150,8 +150,8 @@ void tclBlockDiffMap(const float* r, const float* g, const float* b,
 	FLOAT_COMPARE(r_dc, result_diff_dc, res_xsize * res_ysize * 3);
 	FLOAT_COMPARE(r_ac, result_diff_ac, res_xsize * res_ysize * 3);
 
-	clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_dc, r_dc, reschannel_size, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, reschannel_size, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_dc, r_dc, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
 	ocl.releaseMemChannels(xyb0);
@@ -197,7 +197,7 @@ void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
 
 	FLOAT_COMPARE(r_ac, result_diff_ac, res_xsize * res_ysize * 3);
 
-	clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, reschannel_size, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, block_diff_ac, r_ac, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
 	ocl.releaseMemChannels(xyb0);
@@ -246,12 +246,12 @@ void tclMask(const float* r, const float* g, const float* b,
 	FLOAT_COMPARE(maskdc_g, r1_g, xsize * ysize);
 	FLOAT_COMPARE(maskdc_b, r1_b, xsize * ysize);
 
-	clEnqueueUnmapMemObject(ocl.commandQueue, mask.r, r0_r, channel_size, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, mask.g, r0_g, channel_size, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, mask.b, r0_b, channel_size, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.r, r1_r, channel_size, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.g, r1_g, channel_size, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.b, r1_b, channel_size, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, mask.r, r0_r, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, mask.g, r0_g, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, mask.b, r0_b, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.r, r1_r, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.g, r1_g, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, mask_dc.b, r1_b, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
 	ocl.releaseMemChannels(rgb);
@@ -299,7 +299,7 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const
 
 	FLOAT_COMPARE(result_tmp, result, res_xsize * res_ysize);
 
-  clEnqueueUnmapMemObject(ocl.commandQueue, cl_result, result_tmp, res_xsize * res_ysize * sizeof(float), NULL, NULL);
+  clEnqueueUnmapMemObject(ocl.commandQueue, cl_result, result_tmp, 0, NULL, NULL);
 	ocl.releaseMemChannels(mask);
 	ocl.releaseMemChannels(mask_dc);
 	clReleaseMemObject(cl_block_diff_dc);
@@ -324,7 +324,7 @@ void tclCalculateDiffmap(const size_t xsize, const size_t ysize,
 	//cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diffmap, true, CL_MAP_READ, 0, length, 0, NULL, NULL, &err);
   //err = clFinish(ocl.commandQueue);
 	//FLOAT_COMPARE(result_tmp, diffmap_cmp, xsize * ysize);
-  //clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, length, NULL, NULL);
+  //clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, 0, NULL, NULL);
 	clReleaseMemObject(mem_diffmap);
 }
 
@@ -346,7 +346,7 @@ void tclBlur(float* channel, size_t xsize, size_t ysize, double sigma, double bo
 
     FLOAT_COMPARE(result, r_r, xsize * ysize);
 
-    clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, channel_size, NULL, NULL);
+    clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL);
     err = clFinish(ocl.commandQueue);
 
     clReleaseMemObject(r);
@@ -383,7 +383,7 @@ void tclConvolution(size_t xsize, size_t ysize,
 
 	FLOAT_COMPARE(result, r_r, dxsize * ysize);
 
-	clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, result_size, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
 	clReleaseMemObject(i);
@@ -415,7 +415,7 @@ void tclUpsample(float* image, size_t xsize, size_t ysize,
 
 	FLOAT_COMPARE(result, r_r, xsize * ysize);
 
-	clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, result_size, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
 	clReleaseMemObject(img);
@@ -457,9 +457,9 @@ void tclDiffPrecompute(
   ocl.releaseMemChannels(cl_xyb0);
   ocl.releaseMemChannels(cl_xyb1);
   ocl.releaseMemChannels(cl_mask);
-  clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.x, r_x, channel_size, NULL, NULL);
-  clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.y, r_y, channel_size, NULL, NULL);
-  clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.b, r_b, channel_size, NULL, NULL);
+  clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.x, r_x, 0, NULL, NULL);
+  clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.y, r_y, 0, NULL, NULL);
+  clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.b, r_b, 0, NULL, NULL);
 }
 
 // ian todo
@@ -474,7 +474,7 @@ void tclAverage5x5(int xsize, int ysize, std::vector<float> &diffs_org, std::vec
   err = clFinish(ocl.commandQueue);
   FLOAT_COMPARE(r, diffs_cmp.data(), xsize * ysize);
 
-  clEnqueueUnmapMemObject(ocl.commandQueue, mem_diff, r, xsize * ysize * sizeof(float), NULL, NULL);
+  clEnqueueUnmapMemObject(ocl.commandQueue, mem_diff, r, 0, NULL, NULL);
   clReleaseMemObject(mem_diff);
 }
 
@@ -498,7 +498,7 @@ void tclMinSquareVal(float *img, size_t square_size, size_t offset,
 
 	FLOAT_COMPARE(values, r_r, xsize * ysize);
 
-	clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, img_size, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
 	clReleaseMemObject(r);
@@ -517,7 +517,7 @@ void tclScaleImage(double scale, float *result_org, float *result_cmp, size_t le
 
   FLOAT_COMPARE(r_r, result_cmp, length);
 
-  clEnqueueUnmapMemObject(ocl.commandQueue, mem_result_org, r_r, length * sizeof(float), NULL, NULL);
+  clEnqueueUnmapMemObject(ocl.commandQueue, mem_result_org, r_r, 0, NULL, NULL);
   clReleaseMemObject(mem_result_org);
 }
 
@@ -546,9 +546,9 @@ void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ys
 	FLOAT_COMPARE(result_g, r_g, xsize * ysize);
 	FLOAT_COMPARE(result_b, r_b, xsize * ysize);
 
-	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, r_r, channel_size, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, r_g, channel_size, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, r_b, channel_size, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, r_r, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, r_g, 0, NULL, NULL);
+	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, r_b, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
 	ocl.releaseMemChannels(rgb);

From b2d8639423665e7e4081cc2fee9a253b1a614dae Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 20 May 2017 22:10:04 +0800
Subject: [PATCH 093/189] =?UTF-8?q?=E7=B2=BE=E7=AE=80=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl.cpp       | 145 +++++++++++++++++++++++++++++++
 clguetzli/clguetzli_comparator.h |  11 +--
 2 files changed, 146 insertions(+), 10 deletions(-)
 create mode 100644 clguetzli/clguetzli.cl.cpp

diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp
new file mode 100644
index 00000000..fb994d7c
--- /dev/null
+++ b/clguetzli/clguetzli.cl.cpp
@@ -0,0 +1,145 @@
+#include <algorithm>
+#include <stdint.h>
+#include "utils.h"
+#include "clguetzli_comparator.h"
+
+extern bool g_useOpenCL;
+extern bool g_checkOpenCL;
+
+using namespace std;
+
+int get_global_id(int dim)
+{
+    return 0;
+}
+
+int get_global_size(int dim)
+{
+    return 0;
+}
+
+#define abs(exper)    fabs((exper))
+
+#define __opencl
+#include "clguetzli.cl"
+
+
+namespace guetzli
+{
+    ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height,
+        const std::vector<uint8_t>* rgb,
+        const float target_distance, ProcessStats* stats)
+        : ButteraugliComparator(width, height, rgb, target_distance, stats)
+    {
+
+    }
+    // Runs the base-class setup, then fills the two flat per-8x8-block caches that CompareBlockEx hands to CompareBlockFactor.
+    void ButteraugliComparatorEx::StartBlockComparisons()
+    {
+        ButteraugliComparator::StartBlockComparisons();
+        // The caches always use an unsubsampled tiling: both factors are fixed at 1.
+        const int width = width_;
+        const int height = height_;
+        const int factor_x = 1;
+        const int factor_y = 1;
+        // Ceiling division, so a partial block at the right/bottom edge is still counted.
+        const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x);
+        const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y);
+        const int num_blocks = block_width * block_height;
+
+        const double* lut = kSrgb8ToLinearTable;
+        // Per block: three planes of kDCTBlockSize floats (R, then G, then B) and three mask values.
+        imgOpsinDynamicsBlockList.resize(num_blocks * 3 * kDCTBlockSize);
+        imgMaskXyzScaleBlockList.resize(num_blocks * 3);
+        for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y)
+        {
+            for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix)
+            {
+                float* curR = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize];
+                float* curG = curR + kDCTBlockSize;
+                float* curB = curG + kDCTBlockSize;
+                // Row-major fill; coordinates past the image are clamped, so the last row/column is repeated.
+                for (int iy = 0, i = 0; iy < 8; ++iy) {
+                    for (int ix = 0; ix < 8; ++ix, ++i) {
+                        int x = std::min(8 * block_x + ix, width - 1);
+                        int y = std::min(8 * block_y + iy, height - 1);
+                        int px = y * width + x;
+                        // rgb_orig_ is read as interleaved 3-values-per-pixel data and mapped through lut.
+                        curR[i] = lut[rgb_orig_[3 * px]];
+                        curG[i] = lut[rgb_orig_[3 * px + 1]];
+                        curB[i] = lut[rgb_orig_[3 * px + 2]];
+                    }
+                }
+                // The mask scale is sampled once per block, at the block's top-left pixel.
+                int xmin = block_x * 8;
+                int ymin = block_y * 8;
+
+                imgMaskXyzScaleBlockList[block_ix * 3] = mask_xyz_[0][ymin * width_ + xmin];
+                imgMaskXyzScaleBlockList[block_ix * 3 + 1] = mask_xyz_[1][ymin * width_ + xmin];
+                imgMaskXyzScaleBlockList[block_ix * 3 + 2] = mask_xyz_[2][ymin * width_ + xmin];
+            }
+        }
+    }
+    // Ends a comparison pass: runs the base-class teardown, then empties both per-block caches.
+    void ButteraugliComparatorEx::FinishBlockComparisons() {
+        ButteraugliComparator::FinishBlockComparisons();
+        // clear() removes the elements only; std::vector keeps its capacity.
+        imgOpsinDynamicsBlockList.clear();
+        imgMaskXyzScaleBlockList.clear();
+    }
+
+    void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y)
+    {
+        block_x_ = block_x;
+        block_y_ = block_y;
+        factor_x_ = factor_x;
+        factor_y_ = factor_y;
+
+        ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y);
+    }
+
+    // Scores `candidate_block` for the current block by delegating to
+    // ButteraugliComparator::CompareBlock and returning its result unchanged.
+    //
+    // All arguments are forwarded verbatim; their meaning is defined by the
+    // base class.
+    //
+    // This used to be followed by a g_checkOpenCL cross-check, but it sat after
+    // an unconditional `return` and compared the base-class result with a
+    // second call to the same base-class function, so it could neither run nor
+    // detect a mismatch. The dead code is removed; the returned value is the
+    // same as before.
+    double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const
+    {
+        return ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask);
+    }
+
+    double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const
+    {
+        const int block_x = block_x_;
+        const int block_y = block_y_;
+        const int factor = factor_x_;
+
+        const coeff_t *candidate_channel[3];
+        channel_info mayout_channel[3];
+        for (int c = 0; c < 3; c++)
+        {
+            candidate_channel[c] = &candidate_block[c * 8 * 8];
+            mayout_channel[c].block_height = img.component(c).height_in_blocks();
+            mayout_channel[c].block_width = img.component(c).width_in_blocks();
+            mayout_channel[c].factor = img.component(c).factor_x();
+            mayout_channel[c].pixel = img.component(c).pixels();
+            mayout_channel[c].coeff = img.component(c).coeffs();
+        }
+
+        return CompareBlockFactor(mayout_channel,
+            candidate_block,
+            block_x,
+            block_y,
+            imgOpsinDynamicsBlockList.data(),
+            imgMaskXyzScaleBlockList.data(),
+            width_,
+            height_,
+            factor_x_);
+    }
+}
diff --git a/clguetzli/clguetzli_comparator.h b/clguetzli/clguetzli_comparator.h
index 840254a7..721fcb32 100644
--- a/clguetzli/clguetzli_comparator.h
+++ b/clguetzli/clguetzli_comparator.h
@@ -16,16 +16,7 @@ namespace guetzli {
 		void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) override;
 
         double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override;
-		double CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const;
-        double CompareBlockEx2(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const;
-    private:
-        int    GetOrigBlock(std::vector< std::vector<float> > &rgb0_c, int off_x, int off_y) const;
-        double ComputeImage8x8Block(std::vector<std::vector<float> > &rgb0_c,
-                                    std::vector<std::vector<float> > &rgb1_c,
-                                    int block_8x8idx) const;
-
-        int getCurrentBlockIdx(void) const;
-        int getCurrentBlock8x8Idx(int off_x, int off_y) const;
+		double CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const;
 	public:
 		std::vector<float> imgOpsinDynamicsBlockList;   // [RR..RRGG..GGBB..BB]:blockCount
         std::vector<float> imgMaskXyzScaleBlockList;    // [RGBRGB..RGBRGB]:blockCount

From 5a546242b50ab07ac8a90c80569e31ff6301b824 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 20 May 2017 23:42:52 +0800
Subject: [PATCH 094/189] =?UTF-8?q?=E6=B8=85=E7=90=86=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl             | 171 +-------
 clguetzli/clguetzli.cl.cpp         |  44 +-
 clguetzli/clguetzli.cl.h           |  84 ++++
 clguetzli/clguetzli.cpp            |   1 -
 clguetzli/clguetzli.h              |  27 +-
 clguetzli/clguetzli_comparator.cpp | 643 -----------------------------
 clguetzli/clguetzli_comparator.h   |  25 --
 clguetzli/ocl.h                    |   1 -
 8 files changed, 134 insertions(+), 862 deletions(-)
 create mode 100644 clguetzli/clguetzli.cl.h
 delete mode 100644 clguetzli/clguetzli_comparator.cpp
 delete mode 100644 clguetzli/clguetzli_comparator.h

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 7da649e6..793e0f0b 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -2,11 +2,11 @@
 
 #include  "clguetzli\clguetzli.cl.h"
 
-#define kBlockEdge 8
-#define kBlockSize (kBlockEdge * kBlockEdge)
-#define kDCTBlockSize (kBlockEdge * kBlockEdge)
+#define kBlockEdge      8
+#define kBlockSize      (kBlockEdge * kBlockEdge)
+#define kDCTBlockSize   (kBlockEdge * kBlockEdge)
 #define kBlockEdgeHalf  (kBlockEdge / 2)
-#define kBlockHalf (kBlockEdge * kBlockEdgeHalf)
+#define kBlockHalf      (kBlockEdge * kBlockEdgeHalf)
 
 void   XybToVals(double x, double y, double z, double *valx, double *valy, double *valz);
 double InterpolateClampNegative(__global const double *array, int size, double sx);
@@ -2941,17 +2941,7 @@ void coeffcopy(coeff_t *dst, const coeff_t *src, int size)
     }
 }
 
-void CalcOpsinDynamicsImage(ocl_channels rgb)
-{
-    float rgb_blurred[3][kDCTBlockSize];
-    for (int i = 0; i < 3; i++)
-    {
-        BlurEx(rgb.ch[i], 8, 8, 1.1, 0, rgb_blurred[i]);
-    }
-    OpsinDynamicsImageBlock(rgb.r, rgb.g, rgb.b, rgb_blurred[0], rgb_blurred[1], rgb_blurred[2], kDCTBlockSize);
-}
-
-void CalcOpsinDynamicsImage2(__private float rgb[3][kDCTBlockSize])
+void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize])
 {
     float rgb_blurred[3][kDCTBlockSize];
     for (int i = 0; i < 3; i++)
@@ -2963,8 +2953,8 @@ void CalcOpsinDynamicsImage2(__private float rgb[3][kDCTBlockSize])
 
 double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block)
 {
-    CalcOpsinDynamicsImage2(rgb0_c);
-    CalcOpsinDynamicsImage2(rgb1_c);
+    CalcOpsinDynamicsImage(rgb0_c);
+    CalcOpsinDynamicsImage(rgb1_c);
 
     float rgb0[3][kDCTBlockSize];
     float rgb1[3][kDCTBlockSize];
@@ -3005,153 +2995,6 @@ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private
     return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge);
 }
 
-// strong todo
-// candidate_block [R....R][G....G][B....B]
-// orig_image_block [RR..RRGG..GGBB..BB]
-// mask_scale[RGB]
-float CompareBlockEx(coeff_t *candidate_block, __global const float* orig_image_block, __global const float* mask_scale_block)
-{
-    float rgb0[3][kDCTBlockSize];
-    float rgb1[3][kDCTBlockSize];
-    {
-        float rgb0_data[3*kDCTBlockSize];
-        ocl_channels rgb0_c;
-        rgb0_c.r = &rgb0_data[0];
-        rgb0_c.g = &rgb0_data[kDCTBlockSize];
-        rgb0_c.b = &rgb0_data[2 * kDCTBlockSize];
-        for (int i = 0; i < 3*kDCTBlockSize; i++)
-        {
-            rgb0_data[i] = orig_image_block[i];
-        }
-
-        float image_block[3 * kDCTBlockSize];
-        ocl_channels rgb1_c;
-        rgb1_c.r = &image_block[0];
-        rgb1_c.g = &image_block[kDCTBlockSize];
-        rgb1_c.b = &image_block[2 * kDCTBlockSize];
-        BlockToImage(candidate_block, rgb1_c.r, rgb1_c.g, rgb1_c.b, 8, 8);
-
-        CalcOpsinDynamicsImage(rgb0_c);
-        CalcOpsinDynamicsImage(rgb1_c);
-
-        floatcopy(&rgb0[0][0], rgb0_data, 3 * kDCTBlockSize);
-        floatcopy(&rgb1[0][0], image_block, 3 * kDCTBlockSize);
-
-        MaskHighIntensityChangeBlock(rgb0[0],rgb0[1], rgb0[2],
-                                     rgb1[0], rgb1[1], rgb1[2],
-                                     rgb0_c.ch[0], rgb0_c.ch[1], rgb0_c.ch[2],
-                                     rgb1_c.ch[0], rgb1_c.ch[1], rgb1_c.ch[2],
-                                     8, 8);
-
-    }
-
-    // ����ΪɶҪ��floatת��double���ܼ��������㣿
-    double b0[3 * kDCTBlockSize];       //
-    double b1[3 * kDCTBlockSize];
-    for (int c = 0; c < 3; ++c) {
-        for (int ix = 0; ix < kDCTBlockSize; ++ix) {
-            b0[c * kDCTBlockSize + ix] = rgb0[c][ix];
-            b1[c * kDCTBlockSize + ix] = rgb1[c][ix];
-        }
-    }
-
-    double diff_xyz_dc[3] = { 0.0 };
-    double diff_xyz_ac[3] = { 0.0 };
-    double diff_xyz_edge_dc[3] = { 0.0 };
-    ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc);
-
-    double diff = 0.0;
-    double diff_edge = 0.0;
-
-    for (int c = 0; c < 3; ++c) {
-        diff += diff_xyz_dc[c] * mask_scale_block[c];
-        diff += diff_xyz_ac[c] * mask_scale_block[c];
-        diff_edge += diff_xyz_edge_dc[c] * mask_scale_block[c];
-    }
-    const double kEdgeWeight = 0.05;
-    return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge);
-}
-
-// strong todo
-// batch��ָ�Ѿ���ά��չ��Ϊ��һά��
-__kernel void clComputeBlockZeroingOrder(__global const coeff_t *orig_batch,         // ԭʼͼ��ϵ��
-                                         __global const float   *orig_image_batch,   // ԭʼͼ��pregamma��
-                                         __global const float   *mask_scale,         // ԭʼͼ���ĳ�����ز���
-                                         __global const coeff_t *mayout_batch,       // �����ѡͼ��ϵ��
-                                         float BlockErrorLimit,
-                                         __global CoeffData *output_order_list/*out*/)
-{
-    int block_idx = get_global_id(0);
-#define kComputeBlockSize (kBlockSize * 3)
-
-    __global const coeff_t *orig_block       = orig_batch + block_idx * kComputeBlockSize;
-    __global const coeff_t *mayout_block     = mayout_batch + block_idx * kComputeBlockSize;
-    __global const float   *orig_image_block = orig_image_batch + block_idx * kComputeBlockSize;
-
-    DCTScoreData input_order_data[kComputeBlockSize];
-    CoeffData    output_order_data[kComputeBlockSize];
-
-    IntFloatPairList input_order  = { 0, input_order_data };
-    IntFloatPairList output_order = { 0, output_order_data };
-
-    int count = MakeInputOrder(mayout_block, orig_block, &input_order, kBlockSize);
-
-    coeff_t processed_block[kComputeBlockSize];
-    for (int i = 0; i < kComputeBlockSize; i++) {
-        processed_block[i] = mayout_block[i];
-    }
-
-    while (input_order.size > 0)
-    {
-        float best_err = 1e17f;
-        int best_i = 0;
-        for (int i = 0; i < min(3, input_order.size); i++)
-        {
-            coeff_t candidate_block[kComputeBlockSize];
-            for (int i = 0; i < kComputeBlockSize; i++) {
-                candidate_block[i] = processed_block[i];
-            }
-
-            const int idx = input_order.pData[i].idx;
-
-            candidate_block[idx] = 0;
-
-            float max_err = CompareBlockEx(candidate_block, orig_image_block, mask_scale + block_idx * 3);
-            if (max_err < best_err)
-            {
-                best_err = max_err;
-                best_i = i;
-            }
-        }
-
-        int idx = input_order.pData[best_i].idx;
-        processed_block[idx] = 0;
-        list_erase(&input_order, best_i);
-
-        list_push_back(&output_order, idx, best_err);
-    }
-    // ע��output_order�����resize���ǰ�β������λ0
-    float min_err = 1e10;
-    for (int i = output_order.size - 1; i >= 0; --i) {
-        min_err = min(min_err, output_order.pData[i].err);
-        output_order.pData[i].err = min_err;
-    }
-
-    __global CoeffData *output_block = output_order_list + block_idx * kComputeBlockSize;
-
-    int out_count = 0;
-    for (int i = 0; i < kComputeBlockSize && i < output_order.size; i++)
-    {
-        // ���˽ϴ��err���ⲿ�ֽ����˼���û������
-        if (output_order.pData[i].err <= BlockErrorLimit)
-        {
-            output_block[out_count].idx = output_order.pData[i].idx;
-            output_block[out_count].err = output_order.pData[i].err;
-            out_count++;
-        }
-    }
-}
-
 // return the count of Non-zero item
 int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order)
 {
diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp
index fb994d7c..9e6f7b87 100644
--- a/clguetzli/clguetzli.cl.cpp
+++ b/clguetzli/clguetzli.cl.cpp
@@ -1,29 +1,32 @@
 #include <algorithm>
 #include <stdint.h>
+#include <vector>
 #include "utils.h"
-#include "clguetzli_comparator.h"
-
-extern bool g_useOpenCL;
-extern bool g_checkOpenCL;
 
 using namespace std;
 
-int get_global_id(int dim)
-{
-    return 0;
-}
+int g_idvec[10] = { 0 };
+int g_sizevec[10] = { 0 };
 
-int get_global_size(int dim)
-{
-    return 0;
+int get_global_id(int dim) {  // host stand-in for the OpenCL built-in; reports the value stored in g_idvec[dim]
+    return (dim >= 0 && dim < (int)(sizeof(g_idvec) / sizeof(g_idvec[0]))) ? g_idvec[dim] : 0;  // out-of-range dim yields 0 (as the OpenCL built-in does) instead of reading past the array
+}
+int get_global_size(int dim) {  // host stand-in for the OpenCL built-in; reports the value stored in g_sizevec[dim]
+    return (dim >= 0 && dim < (int)(sizeof(g_sizevec) / sizeof(g_sizevec[0]))) ? g_sizevec[dim] : 1;  // out-of-range dim yields 1 (as the OpenCL built-in does) instead of reading past the array
 }
 
-#define abs(exper)    fabs((exper))
+void set_global_id(int dim, int id){  // host-side hook: sets the value get_global_id(dim) will report
+    if (dim >= 0 && dim < (int)(sizeof(g_idvec) / sizeof(g_idvec[0]))) g_idvec[dim] = id;  // an out-of-range dim is ignored rather than written past the array
+}
+void set_global_size(int dim, int size){  // host-side hook: sets the value get_global_size(dim) will report
+    if (dim >= 0 && dim < (int)(sizeof(g_sizevec) / sizeof(g_sizevec[0]))) g_sizevec[dim] = size;  // an out-of-range dim is ignored rather than written past the array
+}
 
 #define __opencl
+#define abs(exper)    fabs((exper))
+#include "clguetzli.h"
 #include "clguetzli.cl"
 
-
 namespace guetzli
 {
     ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height,
@@ -90,11 +93,6 @@ namespace guetzli
 
     void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y)
     {
-        block_x_ = block_x;
-        block_y_ = block_y;
-        factor_x_ = factor_x;
-        factor_y_ = factor_y;
-
         ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y);
     }
 
@@ -116,15 +114,9 @@ namespace guetzli
 
     double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const
     {
-        const int block_x = block_x_;
-        const int block_y = block_y_;
-        const int factor = factor_x_;
-
-        const coeff_t *candidate_channel[3];
         channel_info mayout_channel[3];
         for (int c = 0; c < 3; c++)
         {
-            candidate_channel[c] = &candidate_block[c * 8 * 8];
             mayout_channel[c].block_height = img.component(c).height_in_blocks();
             mayout_channel[c].block_width = img.component(c).width_in_blocks();
             mayout_channel[c].factor = img.component(c).factor_x();
@@ -134,8 +126,8 @@ namespace guetzli
 
         return CompareBlockFactor(mayout_channel,
             candidate_block,
-            block_x,
-            block_y,
+            block_x_,
+            block_y_,
             imgOpsinDynamicsBlockList.data(),
             imgMaskXyzScaleBlockList.data(),
             width_,
diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h
new file mode 100644
index 00000000..53a89eef
--- /dev/null
+++ b/clguetzli/clguetzli.cl.h
@@ -0,0 +1,84 @@
+#ifndef __CLGUETZLI_CL_H__
+#define __CLGUETZLI_CL_H__
+
+#ifdef __cplusplus
+
+#define __kernel      // the OpenCL qualifiers expand to nothing when this header is compiled as C++
+#define __private
+#define __global
+#define __constant
+typedef unsigned char uchar;    // C++ spellings for the OpenCL C type names uchar / ushort
+typedef unsigned short ushort;
+// Work-item index shims for running the kernel source on the host; defined in clguetzli.cl.cpp.
+int get_global_id(int dim);
+int get_global_size(int dim);
+void set_global_id(int dim, int id);
+void set_global_size(int dim, int size);
+
+#ifdef __opencl
+typedef union ocl_channels_t   // C++ with __opencl defined: the three channels are plain float pointers
+{
+    struct                     // named access
+    {
+        float * r;
+        float * g;
+        float * b;
+    };
+    union                      // indexed access, intended to alias r/g/b
+    {
+        float *ch[3];
+    };
+}ocl_channels;
+#else
+typedef union ocl_channels_t   // C++ without __opencl: each channel is a cl_mem handle (cl_mem must already be declared by the includer)
+{
+    struct                     // RGB naming
+    {
+        cl_mem r;
+        cl_mem g;
+        cl_mem b;
+    };
+    struct                     // XYB naming: x/y are intended to overlay r/g
+    {
+        cl_mem x;
+        cl_mem y;
+        // `b` is not repeated here: the struct above already provides it, and a second `b` in the same union is a redeclaration outside MSVC's permissive mode.
+    };
+    union                      // indexed view, intended to alias the three handles
+    {
+        cl_mem ch[3];
+    };
+}ocl_channels;
+
+#endif
+
+#else
+typedef union ocl_channels_t   // non-C++ branch (taken when __cplusplus is not defined): channels are float pointers
+{
+    struct                     // named access
+    {
+        float * r;
+        float * g;
+        float * b;
+    };
+
+    union                      // indexed access, intended to alias r/g/b
+    {
+        float *ch[3];
+    };
+}ocl_channels;
+
+#endif
+
+typedef short coeff_t;
+
+typedef struct __channel_info_t    // one image component as passed to CompareBlockFactor
+{
+    int factor;                     // filled by the host from component.factor_x()
+    int block_width;                // component.width_in_blocks()
+    int block_height;               // component.height_in_blocks()
+    __global const coeff_t *coeff;  // component.coeffs()
+    __global const ushort  *pixel;  // component.pixels()
+}channel_info;
+
+#endif
\ No newline at end of file
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index b5d7c4a2..d04e8c1c 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -63,7 +63,6 @@ ocl_args_d_t& getOcl(void)
 	ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMap", &err);
 	ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMap", &err);
 	ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreq", &err);
-    ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrder", &err);
     ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrderFactor", &err);
 
 	return ocl;
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 9be1ac10..3a20eaa1 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -1,9 +1,9 @@
 #pragma once
+#include <vector>
 #include "CL\cl.h"
-#include "guetzli\jpeg_data.h"
 #include "guetzli\processor.h"
+#include "guetzli\butteraugli_comparator.h"
 #include "ocl.h"
-
 #include "clguetzli.cl.h"
 
 extern bool g_useOpenCL;
@@ -94,3 +94,26 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w);
 void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size_t ysize, ocl_channels mask/*out*/);
 
 void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize);
+
+namespace guetzli { class OutputImage; }  // forward declaration; the qualified form `class guetzli::OutputImage;` is not valid standard C++
+
+namespace guetzli {
+    // ButteraugliComparator that also keeps flat per-8x8-block caches, which CompareBlockEx passes to CompareBlockFactor.
+    class ButteraugliComparatorEx : public ButteraugliComparator
+    {
+    public:
+        ButteraugliComparatorEx(const int width, const int height,
+            const std::vector<uint8_t>* rgb,
+            const float target_distance, ProcessStats* stats);
+        // Start fills the two caches below and Finish empties them; both call the base class first.
+        void StartBlockComparisons() override;
+        void FinishBlockComparisons() override;
+        void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) override;
+        // CompareBlock overrides the base-class entry point; CompareBlockEx is an additional, non-virtual path through CompareBlockFactor.
+        double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override;
+        double CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const;
+    public:
+        std::vector<float> imgOpsinDynamicsBlockList;   // [RR..RRGG..GGBB..BB]:blockCount
+        std::vector<float> imgMaskXyzScaleBlockList;    // [RGBRGB..RGBRGB]:blockCount
+    };
+}
\ No newline at end of file
diff --git a/clguetzli/clguetzli_comparator.cpp b/clguetzli/clguetzli_comparator.cpp
deleted file mode 100644
index b003ecb8..00000000
--- a/clguetzli/clguetzli_comparator.cpp
+++ /dev/null
@@ -1,643 +0,0 @@
-#include <stdint.h>
-#include <algorithm>
-#include "clguetzli_comparator.h"
-#include "guetzli\idct.h"
-#include "guetzli\color_transform.h"
-#include "guetzli\gamma_correct.h"
-#include "clguetzli\ocl.h"
-#include "clguetzli\clguetzli.h"
-
-using namespace guetzli;
-
-void CoeffToIDCT(const coeff_t block[8*8], uint8_t idct[8*8])
-{
-	guetzli::ComputeBlockIDCT(block, idct);
-}
-
-void IDCTToPixel8x8(const uint8_t idct[8 * 8], uint16_t pixels_[8*8])
-{
-	const int block_x = 0;
-	const int block_y = 0;
-	const int width_ = 8;
-	const int height_ = 8;
-
-	for (int iy = 0; iy < 8; ++iy) {
-		for (int ix = 0; ix < 8; ++ix) {
-			int x = 8 * block_x + ix;
-			int y = 8 * block_y + iy;
-			if (x >= width_ || y >= height_) continue;
-			int p = y * width_ + x;
-			pixels_[p] = idct[8 * iy + ix] << 4;
-		}
-	}
-}
-
-void IDCTToPixel16x16(const uint8_t idct[8*8], uint16_t pixels_out[16*16], const uint16_t *pixel_orig, int block_x, int block_y, int width_, int height_)
-{
-    // Fill in the 10x10 pixel area in the subsampled image that will be the
-    // basis of the upsampling. This area is enough to hold the 3x3 kernel of
-    // the fancy upsampler around each pixel.
-    static const int kSubsampledEdgeSize = 10;
-    uint16_t subsampled[kSubsampledEdgeSize * kSubsampledEdgeSize];
-    for (int j = 0; j < kSubsampledEdgeSize; ++j) {
-        // The order we fill in the rows is:
-        //   8 rows intersecting the block, row below, row above
-        const int y0 = block_y * 16 + (j < 9 ? j * 2 : -2);
-        for (int i = 0; i < kSubsampledEdgeSize; ++i) {
-            // The order we fill in each row is:
-            //   8 pixels within the block, left edge, right edge
-            const int ix = ((j < 9 ? (j + 1) * kSubsampledEdgeSize : 0) +
-                (i < 9 ? i + 1 : 0));
-            const int x0 = block_x * 16 + (i < 9 ? i * 2 : -2);
-            if (x0 < 0) {
-                subsampled[ix] = subsampled[ix + 1];
-            }
-            else if (y0 < 0) {
-                subsampled[ix] = subsampled[ix + kSubsampledEdgeSize];
-            }
-            else if (x0 >= width_) {
-                subsampled[ix] = subsampled[ix - 1];
-            }
-            else if (y0 >= height_) {
-                subsampled[ix] = subsampled[ix - kSubsampledEdgeSize];
-            }
-            else if (i < 8 && j < 8) {
-                subsampled[ix] = idct[j * 8 + i] << 4;
-            }
-            else {
-                // Reconstruct the subsampled pixels around the edge of the current
-                // block by computing the inverse of the fancy upsampler.
-                const int y1 = std::max(y0 - 1, 0);
-                const int x1 = std::max(x0 - 1, 0);
-                subsampled[ix] = (pixel_orig[y0 * width_ + x0] * 9 +
-                    pixel_orig[y1 * width_ + x1] +
-                    pixel_orig[y0 * width_ + x1] * -3 +
-                    pixel_orig[y1 * width_ + x0] * -3) >> 2;
-            }
-        }
-    }
-	// Determine area to update.
-    int xmin = block_x * 16; // std::max(block_x * 16 - 1, 0);
-    int xmax = std::min(block_x * 16 + 15, width_ -  1);
-    int ymin = block_y * 16; // std::max(block_y * 16 - 1, 0);
-    int ymax = std::min(block_y * 16 + 15, height_ - 1);
-
-    // Apply the fancy upsampler on the subsampled block.
-    for (int y = ymin; y <= ymax; ++y) {
-        const int y0 = ((y & ~1) / 2 - block_y * 8 + 1) * kSubsampledEdgeSize;
-        const int dy = ((y & 1) * 2 - 1) * kSubsampledEdgeSize;
-        for (int x = xmin; x <= xmax; ++x) {
-            const int x0 = (x & ~1) / 2 - block_x * 8 + 1;
-            const int dx = (x & 1) * 2 - 1;
-            const int ix = x0 + y0;
-
-            int out_x = x - xmin;
-            int out_y = y - ymin;
-
-            pixels_out[out_y * 16 + out_x] = (subsampled[ix] * 9 + subsampled[ix + dy] * 3 +
-                subsampled[ix + dx] * 3 + subsampled[ix + dx + dy]) >> 4;
-        }
-    }
-}
-
-// out = [YUVYUV....YUVYUV]
-void PixelToYUV(uint16_t pixels_[8*8], uint8_t out[8*8], int xsize = 8, int ysize = 8)
-{
-	const int stride = 3;
-
-	for (int y = 0; y < xsize; ++y) {
-		for (int x = 0; x < ysize; ++x) {
-            int px = y * xsize + x;
-			*out = static_cast<uint8_t>((pixels_[px] + 8 - (x & 1)) >> 4);
-            out += stride;
-		}
-	}
-}
-
-// pixel = [YUVYUV...YUVYUV] to [RGBRGB...RGBRGB]
-void YUVToRGB(uint8_t pixelBlock[3*8*8], int size = 8 * 8)
-{
-	for (int i = 0; i < size; i++)
-	{
-		uint8_t *pixel = &pixelBlock[i*3];
-
-		int y = pixel[0];
-		int cb = pixel[1];
-		int cr = pixel[2];
-		pixel[0] = kRangeLimit[y + kCrToRedTable[cr]];
-		pixel[1] = kRangeLimit[y + ((kCrToGreenTable[cr] + kCbToGreenTable[cb]) >> 16)];
-		pixel[2] = kRangeLimit[y + kCbToBlueTable[cb]];
-	}
-}
-
-void YUVToImage(uint8_t yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize = 8, int ysize = 8, int inside_x = 8, int inside_y = 8)
-{
-    YUVToRGB(yuv, xsize * ysize);
-
-    const double* lut = Srgb8ToLinearTable();
-
-    for (int i = 0; i < xsize * ysize; i++)
-    {
-        r[i] = lut[yuv[3 * i]];
-        g[i] = lut[yuv[3 * i + 1]];
-        b[i] = lut[yuv[3 * i + 2]];
-    }
-    for (int y = 0; y < inside_y; y++)
-    {
-        for (int x = inside_x; x < xsize; x++)
-        {
-            int idx = y * xsize + (inside_x - 1);
-            r[y * xsize + x] = r[idx];
-            g[y * xsize + x] = g[idx];
-            b[y * xsize + x] = b[idx];
-        }
-    }
-    for (int y = inside_y; y < ysize; y++)
-    {
-        for (int x = 0; x < xsize; x++)
-        {
-            int idx = (inside_y - 1) * xsize + x;
-            r[y * xsize + x] = r[idx];
-            g[y * xsize + x] = g[idx];
-            b[y * xsize + x] = b[idx];
-        }
-    }
-}
-
-// block = [R....R][G....G][B.....]
-void BlockToImage(const coeff_t block[8*8*3], float* r, float* g, float* b, int inside_x, int inside_y)
-{
-	uint8_t idct[3][8 * 8];
-	CoeffToIDCT(&block[0], idct[0]);
-	CoeffToIDCT(&block[8 * 8], idct[1]);
-	CoeffToIDCT(&block[8 * 8 * 2], idct[2]);
-
-    uint16_t pixels[3][8 * 8];
-	IDCTToPixel8x8(idct[0], pixels[0]);
-	IDCTToPixel8x8(idct[1], pixels[1]);
-	IDCTToPixel8x8(idct[2], pixels[2]);
-
-	uint8_t yuv[8 * 8 * 3];
-	PixelToYUV(pixels[0], &yuv[0]);
-	PixelToYUV(pixels[1], &yuv[1]);
-	PixelToYUV(pixels[2], &yuv[2]);
-
-    YUVToRGB(yuv);
-
-	const double* lut = Srgb8ToLinearTable();
-
-	for (int i = 0; i < 8 * 8; i++)
-	{
-		r[i] = lut[yuv[3 * i]];
-		g[i] = lut[yuv[3 * i + 1]];
-		b[i] = lut[yuv[3 * i + 2]];
-	}
-    for (int y = 0; y < inside_y; y++)
-    {
-        for (int x = inside_x; x < 8; x++)
-        {
-            int idx = y * 8 + (inside_x - 1);
-            r[y * 8 + x] = r[idx];
-            g[y * 8 + x] = g[idx];
-            b[y * 8 + x] = b[idx];
-        }
-    }
-    for (int y = inside_y; y < 8; y++)
-    {
-        for (int x = 0; x < 8; x++)
-        {
-            int idx = (inside_y - 1) * 8 + x;
-            r[y * 8 + x] = r[idx];
-            g[y * 8 + x] = g[idx];
-            b[y * 8 + x] = b[idx];
-        }
-    }
-}
-
-void CoeffToYUV16x16(const coeff_t block[8 * 8], uint8_t *yuv, const uint16_t *pixel_orig, int block_x, int block_y, int width_, int height_)
-{
-    uint8_t idct[8 * 8];
-    CoeffToIDCT(&block[0], &idct[0]);
-
-    uint16_t pixels[16 * 16];
-    IDCTToPixel16x16(idct, pixels, pixel_orig, block_x, block_y, width_, height_);
-
-    PixelToYUV(pixels, yuv, 16, 16);
-}
-
-void CoeffToYUV8x8(const coeff_t block[8 * 8], uint8_t *yuv)
-{
-    uint8_t idct[8 * 8];
-    CoeffToIDCT(&block[0], &idct[0]);
-
-    uint16_t pixels[8 * 8];
-    IDCTToPixel8x8(idct, pixels);
-
-    PixelToYUV(pixels, yuv);
-}
-
-void Copy8x8To16x16(const uint8_t yuv8x8[3 * 8 * 8], uint8_t yuv16x16[3 * 16 * 16], int off_x, int off_y)
-{
-    for (int y = 0; y < 8; y++)
-    {
-        for (int x = 0; x < 8; x++)
-        {
-            int idx = y * 8 + x;
-            int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8);
-            yuv16x16[idx16 * 3] = yuv8x8[idx * 3];
-        }
-    }
-}
-
-void Copy16x16To8x8(const uint8_t yuv16x16[3 * 16 * 16], uint8_t yuv8x8[3 * 8 * 8], int off_x, int off_y)
-{
-    for (int y = 0; y < 8; y++)
-    {
-        for (int x = 0; x < 8; x++)
-        {
-            int idx = y * 8 + x;
-            int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8);
-            yuv8x8[idx * 3] = yuv16x16[idx16 * 3];
-        }
-    }
-}
-
-void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float g[8 * 8], float b[8 * 8], int off_x, int off_y)
-{
-    for (int y = 0; y < 8; y++)
-    {
-        for (int x = 0; x < 8; x++)
-        {
-            int idx = y * 8 + x;
-            int idx16 = (y + off_y * 8) * 16 + (x + off_x * 8);
-            r[idx] = rgb16x16[0][idx16];
-            g[idx] = rgb16x16[1][idx16];
-            b[idx] = rgb16x16[2][idx16];
-        }
-    }
-}
-
-namespace guetzli
-{
-	ButteraugliComparatorEx::ButteraugliComparatorEx(const int width, const int height,
-		const std::vector<uint8_t>* rgb,
-		const float target_distance, ProcessStats* stats)
-		: ButteraugliComparator(width, height, rgb, target_distance, stats)
-	{
-
-	}
-
-	void ButteraugliComparatorEx::StartBlockComparisons()
-	{
-		ButteraugliComparator::StartBlockComparisons();
-
-        const int width = width_;
-        const int height = height_;
-        const int factor_x = 1;
-        const int factor_y = 1;
-
-        const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x);
-        const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y);
-        const int num_blocks = block_width * block_height;
-
-        const double* lut = Srgb8ToLinearTable();
-
-        imgOpsinDynamicsBlockList.resize(num_blocks * 3 * kDCTBlockSize);
-        imgMaskXyzScaleBlockList.resize(num_blocks * 3);
-        for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y)
-        {
-            for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix)
-            {
-                float* curR = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize];
-                float* curG = curR + kDCTBlockSize;
-                float* curB = curG + kDCTBlockSize;
-
-                for (int iy = 0, i = 0; iy < 8; ++iy) {
-                    for (int ix = 0; ix < 8; ++ix, ++i) {
-                        int x = std::min(8 * block_x + ix, width - 1);
-                        int y = std::min(8 * block_y + iy, height - 1);
-                        int px = y * width + x;
-
-                        curR[i] = lut[rgb_orig_[3 * px]];
-                        curG[i] = lut[rgb_orig_[3 * px + 1]];
-                        curB[i] = lut[rgb_orig_[3 * px + 2]];
-                    }
-                }
-
-                int xmin = block_x * 8;
-                int ymin = block_y * 8;
-
-                imgMaskXyzScaleBlockList[block_ix * 3] = mask_xyz_[0][ymin * width_ + xmin];
-                imgMaskXyzScaleBlockList[block_ix * 3 + 1] = mask_xyz_[1][ymin * width_ + xmin];
-                imgMaskXyzScaleBlockList[block_ix * 3 + 2] = mask_xyz_[2][ymin * width_ + xmin];
-            }
-        }
-    }
-
-    void ButteraugliComparatorEx::FinishBlockComparisons() {
-        ButteraugliComparator::FinishBlockComparisons();
-
-        imgOpsinDynamicsBlockList.clear();
-        imgMaskXyzScaleBlockList.clear();
-    }
-
-    void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y)
-    {
-        block_x_ = block_x;
-        block_y_ = block_y;
-        factor_x_ = factor_x;
-        factor_y_ = factor_y;
-
-        ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y);
-    }
-
-    double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const
-    {
-        double err = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask);
-        return err;
-        if (g_checkOpenCL)
-        {
-            double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask);
-            if (err1 != err)
-            {
-                LogError("CHK %s(%d) \r\n", __FUNCTION__, __LINE__);
-            }
-        }
-
-        return err;
-    }
-
-    double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block) const
-    {
-        int block_ix = getCurrentBlockIdx();
-
-        const float*  block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize];
-
-        // �����ԭʼͼ��
-        std::vector< std::vector<float> > rgb0_c;
-        rgb0_c.resize(3);
-        for (int i = 0; i < 3; i++)
-        {
-            rgb0_c[i].resize(kDCTBlockSize);
-            memcpy(rgb0_c[i].data(), block_opsin + i * kDCTBlockSize, kDCTBlockSize * sizeof(float));
-        }
-
-        // img��ȫ���Ż����ͼ������ͨ��coeff_t���ݷ������rgb
-        int inside_x = block_x_ * 8 + 8 > width_ ? width_ - block_x_ * 8 : 8;
-        int inside_y = block_y_ * 8 + 8 > height_ ? height_ - block_y_ * 8 : 8;
-        std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
-        BlockToImage(candidate_block, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), inside_x, inside_y);
-/*
-        {
-            // ���ܻ������⣬������һ��У��
-            int block_x = block_x_ * factor_x_ + off_x;
-            int block_y = block_y_ * factor_y_ + off_y;
-            int xmin = 8 * block_x;
-            int ymin = 8 * block_y;
-
-            std::vector<std::vector<float> > rgb1_c2(3, std::vector<float>(kDCTBlockSize));
-            img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2);
-
-            for (int i = 0; i < 3; i++)
-            {
-                for (int k = 0; k < 64; k++)
-                {
-                    if (fabs(rgb1_c[i][k] - rgb1_c2[i][k]) > 0.001)
-                    {
-                        LogError("Error: CompareBlock misstake.\n");
-                    }
-                }
-            }
-        }
-*/
-        // �����Ǽ��㹤��
-        return ComputeImage8x8Block(rgb0_c, rgb1_c, getCurrentBlock8x8Idx(off_x, off_y));
-	}
-
-    int ButteraugliComparatorEx::GetOrigBlock(std::vector< std::vector<float> > &rgb0_c, int off_x, int off_y) const
-    {
-        int block_xx = block_x_ * factor_x_ + off_x;
-        int block_yy = block_y_ * factor_y_ + off_y;
-        if (block_xx * 8 >= width_ || block_yy * 8 >= height_) return -1;
-
-        const int block8_width = (width_ + 8 - 1) / 8;
-
-        int block_ix = block_yy * block8_width + block_xx;
-
-        rgb0_c.resize(3);
-        const float*  block_opsin = &imgOpsinDynamicsBlockList[block_ix * 3 * kDCTBlockSize];
-        for (int i = 0; i < 3; i++)
-        {
-            rgb0_c[i].resize(kDCTBlockSize);
-            memcpy(rgb0_c[i].data(), block_opsin + i * kDCTBlockSize, kDCTBlockSize * sizeof(float));
-        }
-
-        return block_ix;
-    }
-
-    double ButteraugliComparatorEx::CompareBlockEx2(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const
-    {
-        const int block_x = block_x_;
-        const int block_y = block_y_;
-        const int factor = factor_x_;
-
-        const coeff_t *candidate_channel[3];
-        channel_info mayout_channel[3];
-        for (int c = 0; c < 3; c++)
-        {
-            candidate_channel[c] = &candidate_block[c * 8 * 8];
-            mayout_channel[c].block_height = img.component(c).height_in_blocks();
-            mayout_channel[c].block_width  = img.component(c).width_in_blocks();
-            mayout_channel[c].factor       = img.component(c).factor_x();
-            mayout_channel[c].pixel        = img.component(c).pixels();
-            mayout_channel[c].coeff        = img.component(c).coeffs();
-        }
-
-        uint8_t yuv16x16[3 * 16 * 16] = { 0 };  // factor 2 mode output image
-        uint8_t yuv8x8[3 * 8 * 8] = { 0 };      // factor 1 mode output image
-
-        // ����comp_mask��Σ�ת��ΪRGB������Ҫ��
-        for (int c = 0; c < 3; c++)
-        {
-            if (mayout_channel[c].factor == 1) {
-                if (factor == 1) {  // channel_factor == factor ˵��Ҫ�������㣬����candidate�е�ϵ��
-                    const coeff_t * coeff_block = candidate_channel[c];
-                    CoeffToYUV8x8(coeff_block, &yuv8x8[c]);
-                }
-                else {
-                    for (int iy = 0; iy < factor; ++iy) {
-                        for (int ix = 0; ix < factor; ++ix) {
-                            int block_xx = block_x * factor + ix;
-                            int block_yy = block_y * factor + iy;
-
-                            if (ix != off_x || iy != off_y) continue;
-                            if (block_xx >= mayout_channel[c].block_width ||
-                                block_yy >= mayout_channel[c].block_height)
-                            {
-                                continue;
-                            }
-                            int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx;
-                            const coeff_t * coeff_block = mayout_channel[c].coeff + block_8x8idx * 8 * 8;
-                            CoeffToYUV8x8(coeff_block, &yuv8x8[c]);
-
-                            // copy YUV8x8 to YUV1616 corner
-                            Copy8x8To16x16(&yuv8x8[c], &yuv16x16[c], ix, iy);
-                        }
-                    }
-                }
-            }
-            else {
-                if (factor == 1) {
-                    int block_xx = block_x / mayout_channel[c].factor;
-                    int block_yy = block_y / mayout_channel[c].factor;
-                    int ix = block_x % mayout_channel[c].factor;;
-                    int iy = block_y % mayout_channel[c].factor;
-
-                    int block_16x16idx = block_yy * mayout_channel[c].block_width + block_xx;
-                    const coeff_t * coeff_block = mayout_channel[c].coeff + block_16x16idx * 8 * 8;
-/*
-                    uint8_t ch[16 * 16] = { 0 };
-                    img.component(c).ToPixels(block_xx * 8, block_yy * 8, 16, 16, ch, 1);
-*/
-                    CoeffToYUV16x16(coeff_block, &yuv16x16[c], mayout_channel[c].pixel, block_xx, block_yy, img.width(), img.height());
-
-                    // copy YUV16x16 corner to YUV8x8
-                    Copy16x16To8x8(&yuv16x16[c], &yuv8x8[c], ix, iy);
-                }
-                else {
-                    const coeff_t * coeff_block = candidate_channel[c];
-                    CoeffToYUV16x16(coeff_block, &yuv16x16[c], mayout_channel[c].pixel, block_x, block_y, img.width(), img.height());
-                }
-            }
-        }
-
-        if (factor == 1)
-        {
-            std::vector< std::vector<float> > rgb0_c;
-            int block_8x8idx = GetOrigBlock(rgb0_c, 0, 0);
-/*
-            uint8_t yuv[3 * 8 * 8];
-
-            std::vector<std::vector<float> > rgb1_c2(3, std::vector<float>(kDCTBlockSize));
-            {
-                int block_x = block_x_ * factor_x_ + off_x;
-                int block_y = block_y_ * factor_y_ + off_y;
-                int xmin = 8 * block_x;
-                int ymin = 8 * block_y;
-
-                img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2);
-
-                img.component(0).ToPixels(xmin, ymin, 8, 8, &yuv[0], 3);
-                img.component(1).ToPixels(xmin, ymin, 8, 8, &yuv[1], 3);
-                img.component(2).ToPixels(xmin, ymin, 8, 8, &yuv[2], 3);
-            }
-*/
-            int inside_x = block_x_ * 8 + 8 > width_ ? width_ - block_x_ * 8 : 8;
-            int inside_y = block_y_ * 8 + 8 > height_ ? height_ - block_y_ * 8 : 8;
-            std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
-            YUVToImage(yuv8x8, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), 8, 8, inside_x, inside_y);
-/*
-            int count = 0;
-            for (int i = 0; i < 64; i++)
-            {
-                if (rgb1_c[0][i] != rgb1_c2[0][i] ||
-                    rgb1_c[1][i] != rgb1_c2[1][i] ||
-                    rgb1_c[2][i] != rgb1_c2[2][i])
-                {
-                    count++;
-                }
-            }
-            if (count > 0)
-            {
-                LogError("fdjskafjdlasfj");
-            }
-*/
-            return ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx);
-        }
-        else
-        {
-            int inside_x = block_x_ * 16 + 16 > width_ ? width_ - block_x_ * 16 : 16;
-            int inside_y = block_y_ * 16 + 16 > height_ ? height_ - block_y_ * 16 : 16;
-/*
-            uint8_t yuv[3 * 8 * 8];
-            std::vector<std::vector<float> > rgb1_c2(3, std::vector<float>(kDCTBlockSize));
-            {
-                int block_x = block_x_ * factor_x_ + off_x;
-                int block_y = block_y_ * factor_y_ + off_y;
-                int xmin = 8 * block_x;
-                int ymin = 8 * block_y;
-
-                img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c2);
-
-                img.component(0).ToPixels(xmin, ymin, 8, 8, &yuv[0], 3);
-                img.component(1).ToPixels(xmin, ymin, 8, 8, &yuv[1], 3);
-                img.component(2).ToPixels(xmin, ymin, 8, 8, &yuv[2], 3);
-            }
-
-*/
-            float rgb16x16[3][16 * 16];
-            YUVToImage(yuv16x16, rgb16x16[0], rgb16x16[1], rgb16x16[2], 16, 16, inside_x, inside_y);
-
-            std::vector< std::vector<float> > rgb0_c;
-            int block_8x8idx = GetOrigBlock(rgb0_c, off_x, off_y);
-
-            std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
-            Copy16x16ToChannel(rgb16x16, rgb1_c[0].data(), rgb1_c[1].data(), rgb1_c[2].data(), off_x, off_y);
-
-            return ComputeImage8x8Block(rgb0_c, rgb1_c, block_8x8idx);
-        }
-    }
-
-    double ButteraugliComparatorEx::ComputeImage8x8Block(std::vector<std::vector<float> > &rgb0_c,
-        std::vector<std::vector<float> > &rgb1_c,
-        int block_8x8idx) const
-    {
-        ::butteraugli::OpsinDynamicsImage(8, 8, rgb0_c);
-        ::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c);
-
-        std::vector<std::vector<float> > rgb0 = rgb0_c;
-        std::vector<std::vector<float> > rgb1 = rgb1_c;
-
-        ::butteraugli::MaskHighIntensityChange(8, 8, rgb0_c, rgb1_c, rgb0, rgb1);
-
-        double b0[3 * kDCTBlockSize];
-        double b1[3 * kDCTBlockSize];
-        for (int c = 0; c < 3; ++c) {
-            for (int ix = 0; ix < kDCTBlockSize; ++ix) {
-                b0[c * kDCTBlockSize + ix] = rgb0[c][ix];
-                b1[c * kDCTBlockSize + ix] = rgb1[c][ix];
-            }
-        }
-        double diff_xyz_dc[3] = { 0.0 };
-        double diff_xyz_ac[3] = { 0.0 };
-        double diff_xyz_edge_dc[3] = { 0.0 };
-        ::butteraugli::ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc);
-
-        double diff = 0.0;
-        double diff_edge = 0.0;
-        for (int c = 0; c < 3; ++c) {
-            diff += diff_xyz_dc[c] * imgMaskXyzScaleBlockList[block_8x8idx * 3 + c];
-            diff += diff_xyz_ac[c] * imgMaskXyzScaleBlockList[block_8x8idx * 3 + c];
-            diff_edge += diff_xyz_edge_dc[c] * imgMaskXyzScaleBlockList[block_8x8idx * 3 + c];
-        }
-        const double kEdgeWeight = 0.05;
-        return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge);
-    }
-
-    int ButteraugliComparatorEx::getCurrentBlockIdx(void) const
-    {
-        const int block_width = (width_ + 8 * factor_x_ - 1) / (8 * factor_x_);
-        const int block_height = (height_ + 8 * factor_y_ - 1) / (8 * factor_y_);
-
-        return block_y_ * block_width + block_x_;
-    }
-
-    int ButteraugliComparatorEx::getCurrentBlock8x8Idx(int off_x, int off_y) const
-    {
-        int block_xx = block_x_ * factor_x_ + off_x;
-        int block_yy = block_y_ * factor_y_ + off_y;
-
-        const int block8_width =  (width_ + 8 - 1) / 8;
-        return block_yy * block8_width + block_xx;
-    }
-}
diff --git a/clguetzli/clguetzli_comparator.h b/clguetzli/clguetzli_comparator.h
deleted file mode 100644
index 721fcb32..00000000
--- a/clguetzli/clguetzli_comparator.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#pragma once
-#include <vector>
-#include "guetzli\butteraugli_comparator.h"
-
-namespace guetzli {
-
-	class ButteraugliComparatorEx : public ButteraugliComparator
-	{
-	public:
-		ButteraugliComparatorEx(const int width, const int height,
-			const std::vector<uint8_t>* rgb,
-			const float target_distance, ProcessStats* stats);
-
-		void StartBlockComparisons() override;
-        void FinishBlockComparisons() override;
-		void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) override;
-
-        double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override;
-		double CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const;
-	public:
-		std::vector<float> imgOpsinDynamicsBlockList;   // [RR..RRGG..GGBB..BB]:blockCount
-        std::vector<float> imgMaskXyzScaleBlockList;    // [RGBRGB..RGBRGB]:blockCount
-	};
-
-}
\ No newline at end of file
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 0b8df8b4..c188acc9 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -64,7 +64,6 @@ enum KernelName {
 	KERNEL_EDGEDETECTOR,
 	KERNEL_BLOCKDIFFMAP,
 	KERNEL_EDGEDETECTORLOWFREQ,
-    KERNEL_COMPUTEBLOCKZEROINGORDER,
     KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR,
 	KERNEL_COUNT,
 };

From d0949f18db593ac5a734317173f5e7972860c1aa Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sun, 21 May 2017 00:22:44 +0800
Subject: [PATCH 095/189] =?UTF-8?q?=E6=B8=85=E7=90=86=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl     |  2 +-
 clguetzli/clguetzli.cl.cpp |  2 +
 clguetzli/clguetzli.cpp    |  2 +-
 guetzli.vcxproj            |  1 -
 guetzli.vcxproj.filters    |  3 --
 guetzli/processor.cc       | 81 --------------------------------------
 6 files changed, 4 insertions(+), 87 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 793e0f0b..3d6b34fb 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -2953,7 +2953,7 @@ void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize])
 
 double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block)
 {
-    CalcOpsinDynamicsImage(rgb0_c);
+//    CalcOpsinDynamicsImage(rgb0_c);
     CalcOpsinDynamicsImage(rgb1_c);
 
     float rgb0[3][kDCTBlockSize];
diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp
index 9e6f7b87..4075ef94 100644
--- a/clguetzli/clguetzli.cl.cpp
+++ b/clguetzli/clguetzli.cl.cpp
@@ -74,6 +74,8 @@ namespace guetzli
                     }
                 }
 
+                CalcOpsinDynamicsImage((float(*)[64])curR);
+
                 int xmin = block_x * 8;
                 int ymin = block_y * 8;
 
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index d04e8c1c..ea85dabd 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -1217,7 +1217,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
     cl_mem mem_output_order_batch = ocl.allocMem(sizeof(CoeffData) * item_count);
     cl_float clBlockErrorLimit = BlockErrorLimit;
 
-    cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER];
+    cl_kernel kernel = 0;// ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER];
     clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_batch);
     clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_image_batch);
     clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_mask_scale_batch);
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index c914d909..11a2f227 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -195,7 +195,6 @@
   <ItemGroup>
     <ClInclude Include="clguetzli\clbutter_comparator.h" />
     <ClInclude Include="clguetzli\clguetzli.h" />
-    <ClInclude Include="clguetzli\clguetzli_comparator.h" />
     <ClInclude Include="clguetzli\clguetzli_test.h" />
     <ClInclude Include="clguetzli\ocl.h" />
     <ClInclude Include="clguetzli\utils.h" />
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index 98009b47..4dd1b5ee 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -309,9 +309,6 @@
     <ClInclude Include="clguetzli\clbutter_comparator.h">
       <Filter>clguetzli</Filter>
     </ClInclude>
-    <ClInclude Include="clguetzli\clguetzli_comparator.h">
-      <Filter>clguetzli</Filter>
-    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc">
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 88079303..7a8612c7 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -752,87 +752,6 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
         candidate_coeff_offsets, candidate_coeffs, candidate_coeff_errors);
 
 }
-/*
-void Processor::ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
-    const int block_x, const int block_y, std::vector<CoeffData>* output_order)
-{
-    static const uint8_t oldCsf[kDCTBlockSize] = {
-        10, 10, 20, 40, 60, 70, 80, 90,
-        10, 20, 30, 60, 70, 80, 90, 90,
-        20, 30, 60, 70, 80, 90, 90, 90,
-        40, 60, 70, 80, 90, 90, 90, 90,
-        60, 70, 80, 90, 90, 90, 90, 90,
-        70, 80, 90, 90, 90, 90, 90, 90,
-        80, 90, 90, 90, 90, 90, 90, 90,
-        90, 90, 90, 90, 90, 90, 90, 90,
-    };
-    static const double kWeight[3] = { 1.0, 0.22, 0.20 };
-#include "guetzli/order.inc"
-    std::vector<std::pair<int, float> > input_order;
-    for (int c = 0; c < 3; ++c) {
-        for (int k = 1; k < kDCTBlockSize; ++k) {
-            int idx = c * kDCTBlockSize + k;
-            if (block[idx] != 0) {
-                float score;
-                if (params_.new_zeroing_model) {
-                    score = std::abs(orig_block[idx]) * csf[idx] + bias[idx];
-                }
-                else {
-                    score = static_cast<float>((std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) * kWeight[c] / oldCsf[k]);
-                }
-                input_order.push_back(std::make_pair(idx, score));
-            }
-        }
-    }
-    std::sort(input_order.begin(), input_order.end(), [](const std::pair<int, float>& a, const std::pair<int, float>& b) { return a.second < b.second; });
-
-    coeff_t processed_block[kBlockSize];
-    memcpy(processed_block, block, sizeof(processed_block));
-
-    comparator_->SwitchBlock(block_x, block_y, 1, 1);
-
-    while (!input_order.empty()) {
-        float best_err = 1e17f;
-        int best_i = 0;
-        for (size_t i = 0; i < std::min<size_t>(params_.zeroing_greedy_lookahead, input_order.size()); ++i)
-        {
-            coeff_t candidate_block[kBlockSize];
-            memcpy(candidate_block, processed_block, sizeof(candidate_block));
-
-            const int idx = input_order[i].first;
-
-            candidate_block[idx] = 0; // TOBEREMOVE:�Ա�block������÷�ǰi�͵���0(i����input_order���ݱ仯���仯)���������ûضԱ�ͼ�������������Ӧblock��ȥ�����������ԱȲ��á�
-
-            float max_err = ((ButteraugliComparatorEx*)comparator_)->CompareBlockEx(img, 0, 0, candidate_block);
-            if (max_err < best_err) { // TOBEREMOVE:�ҳ���С����ֵ��i
-                best_err = max_err;
-                best_i = i;
-            }
-        }
-
-        int idx = input_order[best_i].first;
-        processed_block[idx] = 0;
-        input_order.erase(input_order.begin() + best_i);
-
-        output_order->push_back({ idx, best_err }); // TOBEREMOVE:����������������С�����idx����Ӧ���Ա�block�еĶ�Ӧλ����������Ϊ0,�Ƴ�input_order���ѡȡ��ǰֵ������output_order,����ʽ�����õ��Ա�ͼ����ȥ��
-    }
-
-    // TOBEREMOVE:�����Ƴ�err������error���Ƶ���أ�����ԭ�Ա�ͼ��ԭʼֵ��
-    // Make the block error values monotonic.
-    float min_err = 1e10;
-    for (int i = output_order->size() - 1; i >= 0; --i) {
-        min_err = std::min(min_err, (*output_order)[i].block_err);
-        (*output_order)[i].block_err = min_err;
-    }
-    // Cut off at the block error limit.
-    size_t num = 0;
-    while (num < output_order->size() &&
-        (*output_order)[num].block_err <= comparator_->BlockErrorLimit()) {
-        ++num;
-    }
-    output_order->resize(num);
-}
-*/
 void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
                                        const uint8_t comp_mask,
                                        const double target_mul,

From cc746ff03b0f7388a53d56cfdb2b2aaada7c6a0b Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sun, 21 May 2017 01:01:21 +0800
Subject: [PATCH 096/189] =?UTF-8?q?=E6=B8=85=E7=90=86=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl  |  5 +++--
 clguetzli/clguetzli.cpp | 13 ++++++-------
 clguetzli/clguetzli.h   |  2 +-
 clguetzli/ocl.h         |  2 +-
 guetzli/processor.cc    |  6 +-----
 5 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 3d6b34fb..814c4157 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -3161,7 +3161,7 @@ double CompareBlockFactor(const channel_info mayout_channel[3],
 }
 
 // batch��ָ�Ѿ���ά��չ��Ϊ��һά��
-__kernel void clComputeBlockZeroingOrderFactor(
+__kernel void clComputeBlockZeroingOrder(
     __global const coeff_t *orig_batch_0,       // ԭʼͼ��ϵ��
     __global const coeff_t *orig_batch_1,       // ԭʼͼ��ϵ��
     __global const coeff_t *orig_batch_2,       // ԭʼͼ��ϵ��
@@ -3169,12 +3169,14 @@ __kernel void clComputeBlockZeroingOrderFactor(
     __global const float   *mask_scale,         // ԭʼͼ���ĳ�����ز���
     int                    image_width,
     int                    image_height,
+
     __global const coeff_t *mayout_batch_0,     // �����ѡͼ��ϵ��
     __global const coeff_t *mayout_batch_1,     // �����ѡͼ��ϵ��
     __global const coeff_t *mayout_batch_2,     // �����ѡͼ��ϵ��
     __global const ushort  *mayout_pixel_0,
     __global const ushort  *mayout_pixel_1,
     __global const ushort  *mayout_pixel_2,
+
     channel_info            mayout_channel_0,
     channel_info            mayout_channel_1,
     channel_info            mayout_channel_2,
@@ -3217,7 +3219,6 @@ __kernel void clComputeBlockZeroingOrderFactor(
         }
     }
 
-
     DCTScoreData input_order_data[kComputeBlockSize];
     CoeffData    output_order_data[kComputeBlockSize];
 
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index ea85dabd..ba271160 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -63,7 +63,7 @@ ocl_args_d_t& getOcl(void)
 	ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMap", &err);
 	ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMap", &err);
 	ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreq", &err);
-    ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrderFactor", &err);
+    ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrder", &err);
 
 	return ocl;
 }
@@ -1251,7 +1251,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
     clReleaseMemObject(mem_output_order_batch);
 }
 
-void clComputeBlockZeroingOrderFactor(
+void clComputeBlockZeroingOrder(
     const channel_info orig_channel[3],
     const float *orig_image_batch,
     const float *mask_scale,
@@ -1285,7 +1285,6 @@ void clComputeBlockZeroingOrderFactor(
         mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff);
 
         mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel);
-
     }
     cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch);
     cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale);
@@ -1293,12 +1292,12 @@ void clComputeBlockZeroingOrderFactor(
     int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height;
     cl_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size);
     cl_float clBlockErrorLimit = BlockErrorLimit;
-    cl_int  clWidth = image_width;
-    cl_int  clHeight = image_height;
-    cl_int  clFactor = factor;
+    cl_int clWidth = image_width;
+    cl_int clHeight = image_height;
+    cl_int clFactor = factor;
     cl_int clMask = comp_mask;
 
-    cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR];
+    cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER];
     clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_coeff[0]);
     clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_coeff[1]);
     clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_orig_coeff[2]);
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 3a20eaa1..cd5a1524 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -25,7 +25,7 @@ void clComputeBlockZeroingOrder(const coeff_t *orig_batch,
     float BlockErrorLimit,
     guetzli::CoeffData *output_order_batch);
 
-void clComputeBlockZeroingOrderFactor(
+void clComputeBlockZeroingOrder(
     const channel_info orig_channel[3],
     const float *orig_image_batch,
     const float *mask_scale,
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index c188acc9..15e115af 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -64,7 +64,7 @@ enum KernelName {
 	KERNEL_EDGEDETECTOR,
 	KERNEL_BLOCKDIFFMAP,
 	KERNEL_EDGEDETECTORLOWFREQ,
-    KERNEL_COMPUTEBLOCKZEROINGORDERFACTOR,
+    KERNEL_COMPUTEBLOCKZEROINGORDER,
 	KERNEL_COUNT,
 };
 
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 7a8612c7..7540e470 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -74,10 +74,6 @@ class Processor {
       const int block_x, const int block_y, const int factor_x,
       const int factor_y, const uint8_t comp_mask, OutputImage* img, const OutputImage& img2,
       std::vector<CoeffData>* output_order);
-  /*
-  void ComputeBlockZeroingOrder(const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
-      const int block_x, const int block_y, std::vector<CoeffData>* output_order);
-  */
 
   bool SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
                          int best_q[3][kDCTBlockSize],
@@ -659,7 +655,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
         output_order_gpu.resize(num_blocks * kBlockSize);
         output_order = output_order_gpu.data();
 
-        clComputeBlockZeroingOrderFactor(orig_channel,
+        clComputeBlockZeroingOrder(orig_channel,
                                         comp->imgOpsinDynamicsBlockList.data(),
                                         comp->imgMaskXyzScaleBlockList.data(),
                                         width,

From 1f87bb2244092f215012e6ff8c6ec30bb409c7ab Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sun, 21 May 2017 01:04:23 +0800
Subject: [PATCH 097/189] =?UTF-8?q?=E6=B8=85=E7=90=86=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 guetzli/processor.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 7540e470..32cb13bb 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -31,7 +31,6 @@
 #include "guetzli/jpeg_data_writer.h"
 #include "guetzli/output_image.h"
 #include "guetzli/quantize.h"
-#include "clguetzli\clguetzli_comparator.h"
 #include "clguetzli\clguetzli.h"
 
 namespace guetzli {

From add84365e698c488ccc5cec9d66d05b8ea3ef872 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 22 May 2017 09:36:32 +0800
Subject: [PATCH 098/189] =?UTF-8?q?=E5=8E=BB=E6=8E=89=E7=BC=96=E8=AF=91?=
 =?UTF-8?q?=E4=BA=8B=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 guetzli.vcxproj | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 11a2f227..3b6abf4e 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -129,7 +129,8 @@
     </CustomBuild>
     <Intel_OpenCL_Build_Rules />
     <PostBuildEvent>
-      <Command>copy $(ProjectDir)\clguetzli\clguetzli.cl $(ProjectDir)\clguetzli.cl</Command>
+      <Command>
+      </Command>
     </PostBuildEvent>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -171,7 +172,8 @@
       <AdditionalLibraryDirectories>$(INTELOCLSDKROOT)lib\x64</AdditionalLibraryDirectories>
     </Link>
     <PostBuildEvent>
-      <Command>copy $(ProjectDir)\clguetzli\clguetzli.cl $(ProjectDir)\clguetzli.cl</Command>
+      <Command>
+      </Command>
     </PostBuildEvent>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
@@ -384,7 +386,6 @@
       <FileType>Document</FileType>
       <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
       </Command>
-      <Include Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">"C:\Users\strongtu\Documents\Project\git_strong\guetzli\clguetzli";%(Include)</Include>
     </Intel_OpenCL_Build_Rules>
     <None Include="third_party\libpng\pngwin.def" />
     <None Include="third_party\zlib\inffas32.asm" />

From 8f80356104d359a7f855e76749689b7caa610c7d Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 22 May 2017 11:06:18 +0800
Subject: [PATCH 099/189] =?UTF-8?q?=E7=B2=BE=E7=AE=80=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

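---

Notes:
    The subject line (RFC 2047, UTF-8) reads "simplify code" in English.
    Summary of the hunks below:
      * clguetzli/clguetzli.cl.cpp: the SwitchBlock override and
        CompareBlockEx are removed. CompareBlock always returns the result
        of ButteraugliComparator::CompareBlock; when g_checkOpenCL is set
        it additionally calls CompareBlockFactor and logs
        "CompareBlock miss" if the two results differ. Reviewers: the
        check is an exact != on doubles, and only factor_x is passed on.
      * clguetzli/clguetzli.cpp, clguetzli.h: the coeff_t-batch overload of
        clComputeBlockZeroingOrder is removed; the channel_info overload
        remains. Some removed comment lines were not valid UTF-8 and show
        up as replacement characters in this patch text.
      * guetzli/processor.cc: the img2 / bCheck verification scaffolding is
        removed, the previous SelectFrequencyMasking body is deleted and
        SelectFrequencyMaskingBatch takes over that name (without the img2
        parameter). The large SelectFrequencyBackEnd hunk appears to be a
        re-indentation from 4 to 2 spaces only (please confirm with
        "git diff -w"). Process() now holds the comparator as
        std::unique_ptr<ButteraugliComparator>.
      * guetzli.vcxproj, guetzli.vcxproj.filters: clguetzli\clguetzli.cl.h
        is added to the project under the clguetzli filter.
      * third_party/butteraugli/butteraugli/butteraugli.cc: see the hunk in
        _MinSquareVal.
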
 clguetzli/clguetzli.cl.cpp                    |  54 +-
 clguetzli/clguetzli.cpp                       |  57 --
 clguetzli/clguetzli.h                         |  10 -
 guetzli.vcxproj                               |   1 +
 guetzli.vcxproj.filters                       |   3 +
 guetzli/processor.cc                          | 494 +++++++-----------
 .../butteraugli/butteraugli/butteraugli.cc    |  35 +-
 7 files changed, 213 insertions(+), 441 deletions(-)

diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp
index 4075ef94..0a05b038 100644
--- a/clguetzli/clguetzli.cl.cpp
+++ b/clguetzli/clguetzli.cl.cpp
@@ -93,47 +93,37 @@ namespace guetzli
         imgMaskXyzScaleBlockList.clear();
     }
 
-    void ButteraugliComparatorEx::SwitchBlock(int block_x, int block_y, int factor_x, int factor_y)
-    {
-        ButteraugliComparator::SwitchBlock(block_x, block_y, factor_x, factor_y);
-    }
-
     double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const
     {
         double err = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask);
-        return err;
         if (g_checkOpenCL)
         {
-            double err1 = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask);
-            if (err1 != err)
+            channel_info mayout_channel[3];
+            for (int c = 0; c < 3; c++)
             {
-                LogError("CHK %s(%d) \r\n", __FUNCTION__, __LINE__);
+                mayout_channel[c].block_height = img.component(c).height_in_blocks();
+                mayout_channel[c].block_width = img.component(c).width_in_blocks();
+                mayout_channel[c].factor = img.component(c).factor_x();
+                mayout_channel[c].pixel = img.component(c).pixels();
+                mayout_channel[c].coeff = img.component(c).coeffs();
             }
-        }
 
-        return err;
-    }
-
-    double ButteraugliComparatorEx::CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const
-    {
-        channel_info mayout_channel[3];
-        for (int c = 0; c < 3; c++)
-        {
-            mayout_channel[c].block_height = img.component(c).height_in_blocks();
-            mayout_channel[c].block_width = img.component(c).width_in_blocks();
-            mayout_channel[c].factor = img.component(c).factor_x();
-            mayout_channel[c].pixel = img.component(c).pixels();
-            mayout_channel[c].coeff = img.component(c).coeffs();
+            double err2 = CompareBlockFactor(mayout_channel,
+                candidate_block,
+                block_x_,
+                block_y_,
+                imgOpsinDynamicsBlockList.data(),
+                imgMaskXyzScaleBlockList.data(),
+                width_,
+                height_,
+                factor_x_);
+
+            if (err != err2)
+            {
+                LogError("CompareBlock miss %s(%d) \r\n", __FUNCTION__, __LINE__);
+            }
         }
 
-        return CompareBlockFactor(mayout_channel,
-            candidate_block,
-            block_x_,
-            block_y_,
-            imgOpsinDynamicsBlockList.data(),
-            imgMaskXyzScaleBlockList.data(),
-            width_,
-            height_,
-            factor_x_);
+        return err;
     }
 }
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index ba271160..67eb4918 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -1194,63 +1194,6 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	clReleaseMemObject(mem_result);
 }
 
-// batch��ָ�Ѿ���ά��չ��Ϊ��һά��
-void clComputeBlockZeroingOrder(const guetzli::coeff_t *orig_batch,     // ԭʼͼ��ϵ��
-                                const float *orig_image_batch,          // ԭʼͼ��pregamma��
-                                const float* orig_mask_scale_batch,     // ԭʼͼ���ĳ�����ز���
-                                const guetzli::coeff_t *mayout_batch,   // �����ѡͼ��ϵ��
-                                int size,                               //
-                                float BlockErrorLimit,
-                                guetzli::CoeffData *output_order_batch) //
-{
-    using namespace guetzli;
-
-    int item_count = 3 * kDCTBlockSize * size;
-
-    cl_int err = 0;
-    ocl_args_d_t &ocl = getOcl();
-
-    cl_mem mem_orig_batch         = ocl.allocMem(sizeof(::coeff_t) * item_count, orig_batch);
-    cl_mem mem_orig_image_batch   = ocl.allocMem(sizeof(float) * item_count, orig_image_batch);
-    cl_mem mem_mask_scale_batch   = ocl.allocMem(sizeof(float) * 3 * size, orig_mask_scale_batch);
-    cl_mem mem_mayout_batch       = ocl.allocMem(sizeof(::coeff_t) * item_count, mayout_batch);
-    cl_mem mem_output_order_batch = ocl.allocMem(sizeof(CoeffData) * item_count);
-    cl_float clBlockErrorLimit = BlockErrorLimit;
-
-    cl_kernel kernel = 0;// ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_batch);
-    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_image_batch);
-    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_mask_scale_batch);
-    clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_mayout_batch);
-    clSetKernelArg(kernel, 4, sizeof(cl_float), &clBlockErrorLimit);
-    clSetKernelArg(kernel, 5, sizeof(cl_mem), &mem_output_order_batch);
-
-    size_t globalWorkSize[1] = { size };
-    err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-    if (CL_SUCCESS != err)
-    {
-        LogError("Error: clComputeBlockZeroingOrder() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-    }
-    err = clFinish(ocl.commandQueue);
-    if (CL_SUCCESS != err)
-    {
-        LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err));
-    }
-
-    CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_batch, true, CL_MAP_READ, 0, sizeof(CoeffData) * item_count, 0, NULL, NULL, &err);
-    err = clFinish(ocl.commandQueue);
-    memcpy(output_order_batch, result, sizeof(CoeffData) * item_count);
-
-    clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, 0, NULL, NULL);
-    clFinish(ocl.commandQueue);
-
-    clReleaseMemObject(mem_orig_batch);
-    clReleaseMemObject(mem_orig_image_batch);
-    clReleaseMemObject(mem_mask_scale_batch);
-    clReleaseMemObject(mem_mayout_batch);
-    clReleaseMemObject(mem_output_order_batch);
-}
-
 void clComputeBlockZeroingOrder(
     const channel_info orig_channel[3],
     const float *orig_image_batch,
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index cd5a1524..4e6f3209 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -17,14 +17,6 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
     size_t step,
     float* result);
 
-void clComputeBlockZeroingOrder(const coeff_t *orig_batch,
-    const float *orig_image_batch,
-    const float* orig_mask_scale_batch,
-    const coeff_t *mayout_batch,
-    int size,
-    float BlockErrorLimit,
-    guetzli::CoeffData *output_order_batch);
-
 void clComputeBlockZeroingOrder(
     const channel_info orig_channel[3],
     const float *orig_image_batch,
@@ -108,10 +100,8 @@ namespace guetzli {
 
         void StartBlockComparisons() override;
         void FinishBlockComparisons() override;
-        void SwitchBlock(int block_x, int block_y, int factor_x, int factor_y) override;
 
         double CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const override;
-        double CompareBlockEx(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const;
     public:
         std::vector<float> imgOpsinDynamicsBlockList;   // [RR..RRGG..GGBB..BB]:blockCount
         std::vector<float> imgMaskXyzScaleBlockList;    // [RGBRGB..RGBRGB]:blockCount
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 3b6abf4e..42a13971 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -196,6 +196,7 @@
   </ItemDefinitionGroup>
   <ItemGroup>
     <ClInclude Include="clguetzli\clbutter_comparator.h" />
+    <ClInclude Include="clguetzli\clguetzli.cl.h" />
     <ClInclude Include="clguetzli\clguetzli.h" />
     <ClInclude Include="clguetzli\clguetzli_test.h" />
     <ClInclude Include="clguetzli\ocl.h" />
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index 4dd1b5ee..fc895c38 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -309,6 +309,9 @@
     <ClInclude Include="clguetzli\clbutter_comparator.h">
       <Filter>clguetzli</Filter>
     </ClInclude>
+    <ClInclude Include="clguetzli\clguetzli.cl.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc">
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 32cb13bb..43b513dc 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -51,14 +51,9 @@ class Processor {
                        ProcessStats* stats);
 
  private:
-
-  void SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img,
-                              const uint8_t comp_mask, const double target_mul,
-                              bool stop_early, const OutputImage &img2);
-
   void SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
                               const uint8_t comp_mask, const double target_mul,
-                              bool stop_early, const OutputImage &img2);
+                              bool stop_early);
 
   void SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img,
       const uint8_t comp_mask,
@@ -71,7 +66,7 @@ class Processor {
   void ComputeBlockZeroingOrder(
       const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
       const int block_x, const int block_y, const int factor_x,
-      const int factor_y, const uint8_t comp_mask, OutputImage* img, const OutputImage& img2,
+      const int factor_y, const uint8_t comp_mask, OutputImage* img,
       std::vector<CoeffData>* output_order);
 
   bool SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
@@ -350,7 +345,6 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
   const float target_mul_low = 0.95f;
 
   QuantData best = TryQuantMatrix(jpg_in, target_mul_high, best_q, img);
-
   for (;;) {
     int q_next[3][kDCTBlockSize];
     if (!qgen.GetNext(q_next)) {
@@ -379,7 +373,7 @@ bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
 void Processor::ComputeBlockZeroingOrder(
     const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
     const int block_x, const int block_y, const int factor_x,
-    const int factor_y, const uint8_t comp_mask, OutputImage* img, const OutputImage &img2,
+    const int factor_y, const uint8_t comp_mask, OutputImage* img,
     std::vector<CoeffData>* output_order) {
   static const uint8_t oldCsf[kDCTBlockSize] = {
       10, 10, 20, 40, 60, 70, 80, 90,
@@ -416,19 +410,6 @@ void Processor::ComputeBlockZeroingOrder(
   coeff_t processed_block[kBlockSize];
   memcpy(processed_block, block, sizeof(processed_block));
   comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y);
-
-  bool bCheck = false;
-  uint8_t orig_rgb[3][16 * 16] = { 0 };
-  if (bCheck)
-  {
-      for (int c = 0; c < 3; ++c) {
-          if (comp_mask & (1 << c) && factor_x == 2) {
-              if ((block_x + 1) * factor_x * 8 > img->width()) continue;
-              img->component(c).ToPixels((block_x + 1) * factor_x * 8, block_y * factor_y * 8, 16, 16, orig_rgb[c], 1);
-          }
-      }
-  }
-
   while (!input_order.empty()) {
     float best_err = 1e17f;
     int best_i = 0;
@@ -460,36 +441,6 @@ void Processor::ComputeBlockZeroingOrder(
         best_err = max_err;
         best_i = i;
       }
-
-      if (bCheck)
-      {
-          // ÿ�ζ�Ҫ�ָ�һ�¿���
-          for (int c = 0; c < 3; ++c) {
-              if (comp_mask & (1 << c)) {
-                  img->component(c).SetCoeffBlock(block_x, block_y, &block[c * kDCTBlockSize]);
-              }
-          }
-          // �������ٿ��ǲ��ǻָ���
-          uint8_t last_rgb[3][16 * 16] = { 0 };
-          for (int c = 0; c < 3; ++c) {
-              if (comp_mask & (1 << c) && factor_x == 2) {
-                  if ((block_x + 1) * factor_x * 8 > img->width()) continue;
-                  img->component(c).ToPixels((block_x + 1) * factor_x * 8, block_y * factor_y * 8, 16, 16, last_rgb[c], 1);
-              }
-          }
-          int count = 0;
-          for (int c = 0; c < 3; c++) {
-              for (int k = 0; factor_x == 2 && k < 16 * 16; k++) {
-                  if (last_rgb[c][k] != orig_rgb[c][k]) {
-                      count++;
-                  }
-              }
-          }
-          if (count > 0)
-          {
-              LogError("misstake in processing %d:%d block=%d:%d\r\n", count, 16 * 16, block_x, block_y);
-          }
-      }
     }
     int idx = input_order[best_i].first;
     processed_block[idx] = 0;
@@ -522,23 +473,6 @@ void Processor::ComputeBlockZeroingOrder(
           block_x, block_y, &block[c * kDCTBlockSize]);
     }
   }
-
-  if (bCheck)
-  {
-      // ȫͼ���һ��
-      for (int c = 0; c < 3; c++)
-      {
-          int size = img->component(c).pixels_size();
-          if (!(comp_mask & (1 << c))) continue;
-          for (int k = 0; k < size && factor_x == 2; k++)
-          {
-              if (img2.component(c).pixels()[k] != img->component(c).pixels()[k])
-              {
-                  LogError("misstake in restore\r\n");
-              }
-          }
-      }
-  }
 }
 
 namespace {
@@ -611,8 +545,8 @@ size_t EstimateDCSize(const JPEGData& jpg) {
 
 }  // namespace
 
-void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask,
-                                       const double target_mul, bool stop_early, const OutputImage &img2)
+void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask,
+                                       const double target_mul, bool stop_early)
 {
     const int width = img->width();
     const int height = img->height();
@@ -689,7 +623,7 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
                 }
 
                 std::vector<CoeffData> block_order;
-                ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, factor_y, comp_mask, img, img2, &block_order);
+                ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x, factor_y, comp_mask, img, &block_order);
 
                 CoeffData * p = &output_order_cpu[block_ix * kBlockSize];
                 for (int i = 0; i < block_order.size(); i++)
@@ -747,64 +681,6 @@ void Processor::SelectFrequencyMaskingBatch(const JPEGData& jpg, OutputImage* im
         candidate_coeff_offsets, candidate_coeffs, candidate_coeff_errors);
 
 }
-void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
-                                       const uint8_t comp_mask,
-                                       const double target_mul,
-                                       bool stop_early,
-                                       const OutputImage& img2) {
-  const int width = img->width();
-  const int height = img->height();
-  const int ncomp = jpg.components.size();
-  const int last_c = Log2FloorNonZero(comp_mask);
-  if (static_cast<size_t>(last_c) >= jpg.components.size()) return;
-  const int factor_x = img->component(last_c).factor_x();
-  const int factor_y = img->component(last_c).factor_y();
-  const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x);
-  const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y);
-  const int num_blocks = block_width * block_height;
-
-  std::vector<int> candidate_coeff_offsets(num_blocks + 1);
-  std::vector<uint8_t> candidate_coeffs;
-  std::vector<float> candidate_coeff_errors;
-  candidate_coeffs.reserve(60 * num_blocks);
-  candidate_coeff_errors.reserve(60 * num_blocks);
-  std::vector<CoeffData> block_order;
-  block_order.reserve(3 * kDCTBlockSize);
-  comparator_->StartBlockComparisons();
-  for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
-    for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
-      coeff_t block[kBlockSize] = { 0 };
-      coeff_t orig_block[kBlockSize] = { 0 };
-      for (int c = 0; c < 3; ++c) {
-        if (comp_mask & (1 << c)) {
-          assert(img->component(c).factor_x() == factor_x);
-          assert(img->component(c).factor_y() == factor_y);
-          img->component(c).GetCoeffBlock(block_x, block_y,
-                                          &block[c * kDCTBlockSize]);
-          const JPEGComponent& comp = jpg.components[c];
-          int jpg_block_ix = block_y * comp.width_in_blocks + block_x;
-          memcpy(&orig_block[c * kDCTBlockSize],
-                 &comp.coeffs[jpg_block_ix * kDCTBlockSize],
-                 kDCTBlockSize * sizeof(orig_block[0]));
-        }
-      }
-      block_order.clear();
-      ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x,
-                               factor_y, comp_mask, img, img2, &block_order);
-
-      candidate_coeff_offsets[block_ix] = candidate_coeffs.size();
-      for (size_t i = 0; i < block_order.size(); ++i) {
-        candidate_coeffs.push_back(block_order[i].idx);
-        candidate_coeff_errors.push_back(block_order[i].block_err);
-      }
-    }
-  }
-  comparator_->FinishBlockComparisons();
-  candidate_coeff_offsets[num_blocks] = candidate_coeffs.size();
-
-  SelectFrequencyBackEnd(jpg, img, comp_mask, target_mul, stop_early,
-      candidate_coeff_offsets, candidate_coeffs, candidate_coeff_errors);
-}
 
 void Processor::SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img,
                                         const uint8_t comp_mask,
@@ -825,183 +701,183 @@ void Processor::SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img,
     const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y);
     const int num_blocks = block_width * block_height;
 
-    std::vector<JpegHistogram> ac_histograms(ncomp);
-    int jpg_header_size, dc_size;
-    {
-        JPEGData jpg_out = jpg;
-        img->SaveToJpegData(&jpg_out);
-        jpg_header_size = JpegHeaderSize(jpg_out, params_.clear_metadata);
-        dc_size = EstimateDCSize(jpg_out);
-        BuildACHistograms(jpg_out, &ac_histograms[0]);
-    }
-    std::vector<uint8_t> ac_depths;
-    int ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths);
-    int base_size = jpg_header_size + dc_size + ac_histogram_size +
-        EntropyCodedDataSize(ac_histograms, ac_depths);
-    int prev_size = base_size;
-
-    std::vector<float> max_block_error(num_blocks);
-    std::vector<int> last_indexes(num_blocks);
-
-    bool first_up_iter = true;
-    for (int direction : {1, -1}) {
-        for (;;) {
-            if (stop_early && direction == -1) {
-                if (prev_size > 1.01 * final_output_->jpeg_data.size()) {
-                    // If we are down-adjusting the error, the output size will only keep
-                    // increasing.
-                    // TODO(user): Do this check always by comparing only the size
-                    // of the currently processed components.
-                    break;
-                }
+  std::vector<JpegHistogram> ac_histograms(ncomp);
+  int jpg_header_size, dc_size;
+  {
+    JPEGData jpg_out = jpg;
+    img->SaveToJpegData(&jpg_out);
+    jpg_header_size = JpegHeaderSize(jpg_out, params_.clear_metadata);
+    dc_size = EstimateDCSize(jpg_out);
+    BuildACHistograms(jpg_out, &ac_histograms[0]);
+  }
+  std::vector<uint8_t> ac_depths;
+  int ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths);
+  int base_size = jpg_header_size + dc_size + ac_histogram_size +
+      EntropyCodedDataSize(ac_histograms, ac_depths);
+  int prev_size = base_size;
+
+  std::vector<float> max_block_error(num_blocks);
+  std::vector<int> last_indexes(num_blocks);
+
+  bool first_up_iter = true;
+  for (int direction : {1, -1}) {
+    for (;;) {
+      if (stop_early && direction == -1) {
+        if (prev_size > 1.01 * final_output_->jpeg_data.size()) {
+          // If we are down-adjusting the error, the output size will only keep
+          // increasing.
+          // TODO(user): Do this check always by comparing only the size
+          // of the currently processed components.
+          break;
+        }
+      }
+      std::vector<std::pair<int, float> > global_order;
+      int blocks_to_change;
+      std::vector<float> block_weight;
+      for (int rblock = 1; rblock <= 4; ++rblock) {
+        block_weight = std::vector<float>(num_blocks);
+        std::vector<float> distmap(width * height);
+        if (!first_up_iter) {
+          distmap = comparator_->distmap();
+        }
+        comparator_->ComputeBlockErrorAdjustmentWeights(
+            direction, rblock, target_mul, factor_x, factor_y, distmap,
+            &block_weight);
+        global_order.clear();
+        blocks_to_change = 0;
+        for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
+          for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
+            const int last_index = last_indexes[block_ix];
+            const int offset = candidate_coeff_offsets[block_ix];
+            const int num_candidates =
+                candidate_coeff_offsets[block_ix + 1] - offset;
+            const float* candidate_errors = &candidate_coeff_errors[offset];
+            const float max_err = max_block_error[block_ix];
+            if (block_weight[block_ix] == 0) {
+              continue;
             }
-            std::vector<std::pair<int, float> > global_order;
-            int blocks_to_change;
-            std::vector<float> block_weight;
-            for (int rblock = 1; rblock <= 4; ++rblock) {
-                block_weight = std::vector<float>(num_blocks);
-                std::vector<float> distmap(width * height);
-                if (!first_up_iter) {
-                    distmap = comparator_->distmap();
-                }
-                comparator_->ComputeBlockErrorAdjustmentWeights(
-                    direction, rblock, target_mul, factor_x, factor_y, distmap,
-                    &block_weight);
-                global_order.clear();
-                blocks_to_change = 0;
-                for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
-                    for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
-                        const int last_index = last_indexes[block_ix];
-                        const int offset = candidate_coeff_offsets[block_ix];
-                        const int num_candidates =
-                            candidate_coeff_offsets[block_ix + 1] - offset;
-                        const float* candidate_errors = &candidate_coeff_errors[offset];
-                        const float max_err = max_block_error[block_ix];
-                        if (block_weight[block_ix] == 0) {
-                            continue;
-                        }
-                        if (direction > 0) {
-                            for (size_t i = last_index; i < num_candidates; ++i) {
-                                float val = ((candidate_errors[i] - max_err) /
-                                    block_weight[block_ix]);
-                                global_order.push_back(std::make_pair(block_ix, val));
-                            }
-                            blocks_to_change += (last_index < num_candidates ? 1 : 0);
-                        } else {
-                            for (int i = last_index - 1; i >= 0; --i) {
-                                float val = ((max_err - candidate_errors[i]) /
-                                    block_weight[block_ix]);
-                                global_order.push_back(std::make_pair(block_ix, val));
-                            }
-                            blocks_to_change += (last_index > 0 ? 1 : 0);
-                        }
-                    }
-                }
-                if (!global_order.empty()) {
-                    // If we found something to adjust with the current block adjustment
-                    // radius, we can stop and adjust the blocks we have.
-                    break;
-                }
+            if (direction > 0) {
+              for (size_t i = last_index; i < num_candidates; ++i) {
+                float val = ((candidate_errors[i] - max_err) /
+                             block_weight[block_ix]);
+                global_order.push_back(std::make_pair(block_ix, val));
+              }
+              blocks_to_change += (last_index < num_candidates ? 1 : 0);
+            } else {
+              for (int i = last_index - 1; i >= 0; --i) {
+                float val = ((max_err - candidate_errors[i]) /
+                             block_weight[block_ix]);
+                global_order.push_back(std::make_pair(block_ix, val));
+              }
+              blocks_to_change += (last_index > 0 ? 1 : 0);
             }
+          }
+        }
+        if (!global_order.empty()) {
+          // If we found something to adjust with the current block adjustment
+          // radius, we can stop and adjust the blocks we have.
+          break;
+        }
+      }
 
-            if (global_order.empty()) {
-                break;
-            }
+      if (global_order.empty()) {
+        break;
+      }
 
-            std::sort(global_order.begin(), global_order.end(),
+      std::sort(global_order.begin(), global_order.end(),
                 [](const std::pair<int, float>& a,
-                    const std::pair<int, float>& b) {
-                return a.second < b.second; });
+                   const std::pair<int, float>& b) {
+                  return a.second < b.second; });
 
-            double rel_size_delta = direction > 0 ? 0.01 : 0.0005;
-            if (direction > 0 && comparator_->DistanceOK(1.0)) {
-                rel_size_delta = 0.05;
-            }
-            double min_size_delta = base_size * rel_size_delta;
-
-            float coeffs_to_change_per_block =
-                direction > 0 ? 2.0f : factor_x * factor_y * 0.2f;
-            int min_coeffs_to_change = coeffs_to_change_per_block * blocks_to_change;
-
-            if (first_up_iter) {
-                const float limit = 0.75f * comparator_->BlockErrorLimit();
-                auto it = std::partition_point(global_order.begin(), global_order.end(),
-                    [=](const std::pair<int, float>& a) {
-                    return a.second < limit; });
-                min_coeffs_to_change = std::max<int>(min_coeffs_to_change,
-                    it - global_order.begin());
-                first_up_iter = false;
-            }
+      double rel_size_delta = direction > 0 ? 0.01 : 0.0005;
+      if (direction > 0 && comparator_->DistanceOK(1.0)) {
+        rel_size_delta = 0.05;
+      }
+      double min_size_delta = base_size * rel_size_delta;
+
+      float coeffs_to_change_per_block =
+          direction > 0 ? 2.0f : factor_x * factor_y * 0.2f;
+      int min_coeffs_to_change = coeffs_to_change_per_block * blocks_to_change;
+
+      if (first_up_iter) {
+        const float limit = 0.75f * comparator_->BlockErrorLimit();
+        auto it = std::partition_point(global_order.begin(), global_order.end(),
+                                       [=](const std::pair<int, float>& a) {
+                                         return a.second < limit; });
+        min_coeffs_to_change = std::max<int>(min_coeffs_to_change,
+                                             it - global_order.begin());
+        first_up_iter = false;
+      }
 
-            std::set<int> changed_blocks;
-            float val_threshold = 0.0;
-            int changed_coeffs = 0;
-            int est_jpg_size = prev_size;
-            for (size_t i = 0; i < global_order.size(); ++i) {
-                const int block_ix = global_order[i].first;
-                const int block_x = block_ix % block_width;
-                const int block_y = block_ix / block_width;
-                const int last_idx = last_indexes[block_ix];
-                const int offset = candidate_coeff_offsets[block_ix];
-                const uint8_t* candidates = &candidate_coeffs[offset];
-                const int idx = candidates[last_idx + std::min(direction, 0)];
-                const int c = idx / kDCTBlockSize;
-                const int k = idx % kDCTBlockSize;
-                const int* quant = img->component(c).quant();
-                const JPEGComponent& comp = jpg.components[c];
-                const int jpg_block_ix = block_y * comp.width_in_blocks + block_x;
-                const int newval = direction > 0 ? 0 : Quantize(
-                    comp.coeffs[jpg_block_ix * kDCTBlockSize + k], quant[k]);
-                coeff_t block[kDCTBlockSize] = { 0 };
-                img->component(c).GetCoeffBlock(block_x, block_y, block);
-                UpdateACHistogram(-1, block, quant, &ac_histograms[c]);
-                block[k] = newval;
-                UpdateACHistogram(1, block, quant, &ac_histograms[c]);
-                img->component(c).SetCoeffBlock(block_x, block_y, block);
-                last_indexes[block_ix] += direction;
-                changed_blocks.insert(block_ix);
-                val_threshold = global_order[i].second;
-                ++changed_coeffs;
-                static const int kEntropyCodeUpdateFreq = 10;
-                if (i % kEntropyCodeUpdateFreq == 0) {
-                    ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths);
-                }
-                est_jpg_size = jpg_header_size + dc_size + ac_histogram_size +
-                    EntropyCodedDataSize(ac_histograms, ac_depths);
-                if (changed_coeffs > min_coeffs_to_change &&
-                    std::abs(est_jpg_size - prev_size) > min_size_delta) {
-                    break;
-                }
-            }
-            size_t global_order_size = global_order.size();
-            std::vector<std::pair<int, float>>().swap(global_order);
+      std::set<int> changed_blocks;
+      float val_threshold = 0.0;
+      int changed_coeffs = 0;
+      int est_jpg_size = prev_size;
+      for (size_t i = 0; i < global_order.size(); ++i) {
+        const int block_ix = global_order[i].first;
+        const int block_x = block_ix % block_width;
+        const int block_y = block_ix / block_width;
+        const int last_idx = last_indexes[block_ix];
+        const int offset = candidate_coeff_offsets[block_ix];
+        const uint8_t* candidates = &candidate_coeffs[offset];
+        const int idx = candidates[last_idx + std::min(direction, 0)];
+        const int c = idx / kDCTBlockSize;
+        const int k = idx % kDCTBlockSize;
+        const int* quant = img->component(c).quant();
+        const JPEGComponent& comp = jpg.components[c];
+        const int jpg_block_ix = block_y * comp.width_in_blocks + block_x;
+        const int newval = direction > 0 ? 0 : Quantize(
+            comp.coeffs[jpg_block_ix * kDCTBlockSize + k], quant[k]);
+        coeff_t block[kDCTBlockSize] = { 0 };
+        img->component(c).GetCoeffBlock(block_x, block_y, block);
+        UpdateACHistogram(-1, block, quant, &ac_histograms[c]);
+        block[k] = newval;
+        UpdateACHistogram(1, block, quant, &ac_histograms[c]);
+        img->component(c).SetCoeffBlock(block_x, block_y, block);
+        last_indexes[block_ix] += direction;
+        changed_blocks.insert(block_ix);
+        val_threshold = global_order[i].second;
+        ++changed_coeffs;
+        static const int kEntropyCodeUpdateFreq = 10;
+        if (i % kEntropyCodeUpdateFreq == 0) {
+          ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths);
+        }
+        est_jpg_size = jpg_header_size + dc_size + ac_histogram_size +
+            EntropyCodedDataSize(ac_histograms, ac_depths);
+        if (changed_coeffs > min_coeffs_to_change &&
+            std::abs(est_jpg_size - prev_size) > min_size_delta) {
+          break;
+        }
+      }
+      size_t global_order_size = global_order.size();
+      std::vector<std::pair<int, float>>().swap(global_order);
 
-            for (int i = 0; i < num_blocks; ++i) {
-                max_block_error[i] += block_weight[i] * val_threshold * direction;
-            }
+      for (int i = 0; i < num_blocks; ++i) {
+        max_block_error[i] += block_weight[i] * val_threshold * direction;
+      }
 
-            ++stats_->counters[kNumItersCnt];
-            ++stats_->counters[direction > 0 ? kNumItersUpCnt : kNumItersDownCnt];
-            std::string encoded_jpg;
-            {
-                JPEGData jpg_out = jpg;
-                img->SaveToJpegData(&jpg_out);
-                OutputJpeg(jpg_out, &encoded_jpg);
-            }
-            GUETZLI_LOG(stats_,
-                "Iter %2d: %s(%d) %s Coeffs[%d/%zd] "
-                "Blocks[%zd/%d/%d] ValThres[%.4f] Out[%7zd] EstErr[%.2f%%]",
-                stats_->counters[kNumItersCnt], img->FrameTypeStr().c_str(),
-                comp_mask, direction > 0 ? "up" : "down", changed_coeffs,
-                global_order_size, changed_blocks.size(),
-                blocks_to_change, num_blocks, val_threshold,
-                encoded_jpg.size(),
-                100.0 - (100.0 * est_jpg_size) / encoded_jpg.size());
-            comparator_->Compare(*img);
-            MaybeOutput(encoded_jpg);
-            prev_size = est_jpg_size;
-        }
+      ++stats_->counters[kNumItersCnt];
+      ++stats_->counters[direction > 0 ? kNumItersUpCnt : kNumItersDownCnt];
+      std::string encoded_jpg;
+      {
+        JPEGData jpg_out = jpg;
+        img->SaveToJpegData(&jpg_out);
+        OutputJpeg(jpg_out, &encoded_jpg);
+      }
+      GUETZLI_LOG(stats_,
+                  "Iter %2d: %s(%d) %s Coeffs[%d/%zd] "
+                  "Blocks[%zd/%d/%d] ValThres[%.4f] Out[%7zd] EstErr[%.2f%%]",
+                  stats_->counters[kNumItersCnt], img->FrameTypeStr().c_str(),
+                  comp_mask, direction > 0 ? "up" : "down", changed_coeffs,
+                  global_order_size, changed_blocks.size(),
+                  blocks_to_change, num_blocks, val_threshold,
+                  encoded_jpg.size(),
+                  100.0 - (100.0 * est_jpg_size) / encoded_jpg.size());
+      comparator_->Compare(*img);
+      MaybeOutput(encoded_jpg);
+      prev_size = est_jpg_size;
     }
+  }
 }
 
 bool IsGrayscale(const JPEGData& jpg) {
@@ -1096,28 +972,12 @@ bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in,
     img.CopyFromJpegData(jpg);
     img.ApplyGlobalQuantization(best_q);
 
-    OutputImage img2(jpg.width, jpg.height);
-    img2.CopyFromJpegData(jpg);
-    img2.ApplyGlobalQuantization(best_q);
-
-    for (int c = 0; c < 3; c++)
-    {
-        int size = img.component(c).pixels_size();
-        for (int k = 0; k < size; k++)
-        {
-            if (img2.component(c).pixels()[k] != img.component(c).pixels()[k])
-            {
-                LogError("fdjsalfjlkadsfdsafjdsfjdlsajdklsjf\r\n");
-            }
-        }
-    }
-
     if (!downsample) {
-      SelectFrequencyMaskingBatch(jpg, &img, 7, 1.0, false, img2);
+      SelectFrequencyMasking(jpg, &img, 7, 1.0, false);
     } else {
       const float ymul = jpg.components.size() == 1 ? 1.0f : 0.97f;
-      SelectFrequencyMaskingBatch(jpg, &img, 1, ymul, false, img2);
-      SelectFrequencyMaskingBatch(jpg, &img, 6, 1.0, true, img2);
+      SelectFrequencyMasking(jpg, &img, 1, ymul, false);
+      SelectFrequencyMasking(jpg, &img, 6, 1.0, true);
     }
   }
 
@@ -1156,7 +1016,7 @@ bool Process(const Params& params, ProcessStats* stats,
   if (stats == nullptr) {
     stats = &dummy_stats;
   }
-  std::unique_ptr<ButteraugliComparatorEx> comparator;
+  std::unique_ptr<ButteraugliComparator> comparator;
   if (jpg.width >= 32 && jpg.height >= 32) {
     comparator.reset(
         new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb,
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 288bee78..69511051 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -1315,41 +1315,26 @@ void _MinSquareVal(size_t square_size, size_t offset,
   // offset is not negative and smaller than square_size.
   assert(offset < square_size);
   std::vector<float> tmp(xsize * ysize);
-
   for (size_t y = 0; y < ysize; ++y) {
     const size_t minh = offset > y ? 0 : y - offset;
     const size_t maxh = std::min<size_t>(ysize, y + square_size - offset);
-
-    float *pTmpPoint = &tmp[y * xsize];
-    float *pValuePoint = &values[minh * xsize];
-
     for (size_t x = 0; x < xsize; ++x) {
-        float *pValues = pValuePoint++;
-        float min = *pValues;
-
-        for (size_t j = minh + 1; j < maxh; ++j) {
-            pValues += xsize;
-            if (*pValues < min) min = *pValues;
-        }
-        *pTmpPoint++ = min;
+      double min = values[x + minh * xsize];
+      for (size_t j = minh + 1; j < maxh; ++j) {
+        min = fmin(min, values[x + j * xsize]);
+      }
+      tmp[x + y * xsize] = static_cast<float>(min);
     }
   }
   for (size_t x = 0; x < xsize; ++x) {
     const size_t minw = offset > x ? 0 : x - offset;
     const size_t maxw = std::min<size_t>(xsize, x + square_size - offset);
-
-    float *pValuePoint = &values[x];
-    float *pTmpPoint = &tmp[minw];
-
     for (size_t y = 0; y < ysize; ++y) {
-        float * pTmp = pTmpPoint; pTmpPoint += xsize;
-        float min = *pTmp;
-
-        for (size_t j = minw + 1; j < maxw; ++j) {
-            pTmp++;
-            if (*pTmp < min) min = *pTmp;
-        }
-        *pValuePoint = min; pValuePoint += xsize;
+      double min = tmp[minw + y * xsize];
+      for (size_t j = minw + 1; j < maxw; ++j) {
+        min = fmin(min, tmp[j + y * xsize]);
+      }
+      values[x + y * xsize] = static_cast<float>(min);
     }
   }
 }

From f766120bc1fa6dadd6eee4e6a3c44724b918cf90 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 22 May 2017 11:22:36 +0800
Subject: [PATCH 100/189] =?UTF-8?q?=E7=B2=BE=E7=AE=80=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl   |  16 ++---
 clguetzli/clguetzli.cl.h | 141 +++++++++++++++++++--------------------
 2 files changed, 77 insertions(+), 80 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 814c4157..0f97ad8e 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -3167,8 +3167,8 @@ __kernel void clComputeBlockZeroingOrder(
     __global const coeff_t *orig_batch_2,       // ԭʼͼ��ϵ��
     __global const float   *orig_image_batch,   // ԭʼͼ��pregamma
     __global const float   *mask_scale,         // ԭʼͼ���ĳ�����ز���
-    int                    image_width,
-    int                    image_height,
+    const int              image_width,
+    const int              image_height,
 
     __global const coeff_t *mayout_batch_0,     // �����ѡͼ��ϵ��
     __global const coeff_t *mayout_batch_1,     // �����ѡͼ��ϵ��
@@ -3177,12 +3177,12 @@ __kernel void clComputeBlockZeroingOrder(
     __global const ushort  *mayout_pixel_1,
     __global const ushort  *mayout_pixel_2,
 
-    channel_info            mayout_channel_0,
-    channel_info            mayout_channel_1,
-    channel_info            mayout_channel_2,
-    int factor,                                 // ��ǰ���������factor
-    int comp_mask,                              // ��ǰ���������channel
-    float BlockErrorLimit,
+    const channel_info     mayout_channel_0,
+    const channel_info     mayout_channel_1,
+    const channel_info     mayout_channel_2,
+    const int factor,                                 // ��ǰ���������factor
+    const int comp_mask,                              // ��ǰ���������channel
+    const float BlockErrorLimit,
     __global CoeffData *output_order_list/*out*/)
 {
     const int block_x = get_global_id(0);
diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h
index 53a89eef..529ca141 100644
--- a/clguetzli/clguetzli.cl.h
+++ b/clguetzli/clguetzli.cl.h
@@ -2,83 +2,80 @@
 #define __CLGUETZLI_CL_H__
 
 #ifdef __cplusplus
+    #define __kernel
+    #define __private
+    #define __global
+    #define __constant
+    typedef unsigned char uchar;
+    typedef unsigned short ushort;
 
-#define __kernel
-#define __private
-#define __global
-#define __constant
-typedef unsigned char uchar;
-typedef unsigned short ushort;
+    int get_global_id(int dim);
+    int get_global_size(int dim);
+    void set_global_id(int dim, int id);
+    void set_global_size(int dim, int size);
 
-int get_global_id(int dim);
-int get_global_size(int dim);
-void set_global_id(int dim, int id);
-void set_global_size(int dim, int size);
-
-#ifdef __opencl
-typedef union ocl_channels_t
-{
-    struct
-    {
-        float * r;
-        float * g;
-        float * b;
-    };
-    union
-    {
-        float *ch[3];
-    };
-}ocl_channels;
-#else
-typedef union ocl_channels_t
-{
-    struct
-    {
-        cl_mem r;
-        cl_mem g;
-        cl_mem b;
-    };
-    struct
-    {
-        cl_mem x;
-        cl_mem y;
-        cl_mem b;
-    };
-    union
+    #ifdef __opencl
+        typedef union ocl_channels_t
+        {
+            struct
+            {
+                float * r;
+                float * g;
+                float * b;
+            };
+            union
+            {
+                float *ch[3];
+            };
+        }ocl_channels;
+    #else
+        typedef union ocl_channels_t
+        {
+            struct
+            {
+                cl_mem r;
+                cl_mem g;
+                cl_mem b;
+            };
+            struct
+            {
+                cl_mem x;
+                cl_mem y;
+                cl_mem b;
+            };
+            union
+            {
+                cl_mem ch[3];
+            };
+        }ocl_channels;
+    #endif
+#else /*__cplusplus*/
+    typedef union ocl_channels_t
     {
-        cl_mem ch[3];
-    };
-}ocl_channels;
+        struct
+        {
+            float * r;
+            float * g;
+            float * b;
+        };
 
-#endif
+        union
+        {
+            float *ch[3];
+        };
+    }ocl_channels;
 
-#else
-typedef union ocl_channels_t
-{
-    struct
-    {
-        float * r;
-        float * g;
-        float * b;
-    };
-
-    union
-    {
-        float *ch[3];
-    };
-}ocl_channels;
+#endif /*__cplusplus*/
 
-#endif
+    typedef short coeff_t;
 
-typedef short coeff_t;
-
-typedef struct __channel_info_t
-{
-    int factor;
-    int block_width;
-    int block_height;
-    __global const coeff_t *coeff;
-    __global const ushort  *pixel;
-}channel_info;
+    typedef struct __channel_info_t
+    {
+        int factor;
+        int block_width;
+        int block_height;
+        __global const coeff_t *coeff;
+        __global const ushort  *pixel;
+    }channel_info;
 
-#endif
\ No newline at end of file
+#endif /*__CLGUETZLI_CL_H__*/
\ No newline at end of file

From 264209c16c356be19ad805858bc80215d8ef627e Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 22 May 2017 11:51:11 +0800
Subject: [PATCH 101/189] =?UTF-8?q?const=20=E6=8E=A7=E5=88=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl | 127 ++++++++++++++---------------------------
 1 file changed, 44 insertions(+), 83 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 0f97ad8e..b9ee9d1e 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -13,7 +13,7 @@ double InterpolateClampNegative(__global const double *array, int size, double s
 void   XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0,
                                        double r1, double g1, double b1,
                                        double factor, double res[3]);
-double DotProduct(__global float u[3], double v[3]);
+double DotProduct(__global const float u[3], const double v[3]);
 void   OpsinAbsorbance(const double in[3], double out[3]);
 void   RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz);
 double Gamma(double v);
@@ -27,13 +27,13 @@ void Butteraugli8x8CornerEdgeDetectorDiff(
     int pos_y,
     int xsize,
     int ysize,
-    __global float *r, __global float *g, __global float* b,
-    __global float *r2, __global float* g2, __global float *b2,
+    __global const float *r, __global const float *g, __global const float* b,
+    __global const float *r2, __global const float* g2, __global const float *b2,
     double* diff_xyb);
 
 __kernel void clOpsinDynamicsImage(
     __global float *r, __global float *g, __global float *b,
-    __global float *r_blurred, __global float *g_blurred, __global float *b_blurred,
+    __global const float *r_blurred, __global const float *g_blurred, __global const float *b_blurred,
     int size)
 {
     const int i = get_global_id(0);
@@ -60,7 +60,7 @@ __kernel void clOpsinDynamicsImage(
     b[i] = z;
 }
 
-__kernel void clMinSquareVal(__global float* pA, __global float* pC, int square_size, int offset)
+__kernel void clMinSquareVal(__global const float* pA, __global float* pC, int square_size, int offset)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -87,7 +87,7 @@ __kernel void clMinSquareVal(__global float* pA, __global float* pC, int square_
     pC[y * width + x] = minValue;
 }
 
-__kernel void clConvolutionX(__global float* multipliers, __global float* inp, __global float* result,
+__kernel void clConvolutionX(__global const float* multipliers, __global const float* inp, __global float* result,
     int step, int len, int offset, float border_ratio)
 {
     const int x = get_global_id(0);
@@ -125,7 +125,7 @@ __kernel void clConvolutionX(__global float* multipliers, __global float* inp, _
     result[y * xsize + x] = sum * scale;
 }
 
-__kernel void clConvolutionY(__global float* multipliers, __global float* inp, __global float* result,
+__kernel void clConvolutionY(__global const float* multipliers, __global const float* inp, __global float* result,
     int step, int len, int offset, float border_ratio)
 {
     const int x = get_global_id(0);
@@ -164,7 +164,7 @@ __kernel void clConvolutionY(__global float* multipliers, __global float* inp, _
     result[y * xsize + x] = sum * scale;
 }
 
-__kernel void clConvolution(__global float* multipliers, __global float* inp, __global float* result,
+__kernel void clConvolution(__global const float* multipliers, __global const float* inp, __global float* result,
     int xsize, int xstep, int len, int offset, float border_ratio)
 {
     const int ox = get_global_id(0);
@@ -202,7 +202,7 @@ __kernel void clConvolution(__global float* multipliers, __global float* inp, __
     result[ox * ysize + y] = sum * scale;
 }
 
-__kernel void clSquareSample(__global float* pA, __global float* pC, int xstep, int ystep)
+__kernel void clSquareSample(__global const float* pA, __global float* pC, int xstep, int ystep)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -218,7 +218,7 @@ __kernel void clSquareSample(__global float* pA, __global float* pC, int xstep,
     pC[y * xsize + x] = pA[y_sample * xsize + x_sample];
 }
 
-__kernel void clDownSample(__global float* pA, __global float* pC, int xstep, int ystep)
+__kernel void clDownSample(__global const float* pA, __global float* pC, int xstep, int ystep)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -240,7 +240,7 @@ __kernel void clScaleImage(double scale, __global float *result)
     result[i] *= scale;
 }
 
-__kernel void clRemoveBorder(__global float *in, int in_xsize, int s, int s2, __global float *out)
+__kernel void clRemoveBorder(__global const float *in, int in_xsize, int s, int s2, __global float *out)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -251,7 +251,7 @@ __kernel void clRemoveBorder(__global float *in, int in_xsize, int s, int s2, __
     out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2];
 }
 
-__kernel void clAddBorder(__global float *out, int s, int s2, __global float *in)
+__kernel void clAddBorder(__global float *out, int s, int s2, __global const float *in)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -264,10 +264,10 @@ __kernel void clAddBorder(__global float *out, int s, int s2, __global float *in
 }
 
 __kernel void clCombineChannels(
-    __global float *mask_x, __global float *mask_y, __global float *mask_b,
-    __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b,
-    __global float *block_diff_dc,
-    __global float *block_diff_ac,
+    __global const float *mask_x, __global const float *mask_y, __global const float *mask_b,
+    __global const float *mask_dc_x, __global const float *mask_dc_y, __global const float *mask_dc_b,
+    __global const float *block_diff_dc,
+    __global const float *block_diff_ac,
     __global float *edge_detector_map,
     int xsize, int ysize,
     int res_xsize,
@@ -296,8 +296,8 @@ __kernel void clCombineChannels(
 }
 
 __kernel void clDiffPrecompute(
-    __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b,
-    __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b,
+    __global const float *xyb0_x, __global const float *xyb0_y, __global const float *xyb0_b,
+    __global const float *xyb1_x, __global const float *xyb1_y, __global const float *xyb1_b,
     __global float *mask_x, __global float *mask_y, __global float *mask_b)
 {
     const int x = get_global_id(0);
@@ -362,8 +362,8 @@ __kernel void clDiffPrecompute(
 }
 
 __kernel void clEdgeDetectorMap(__global float *result,
-    __global float *r, __global float *g, __global float* b,
-    __global float *r2, __global float* g2, __global float *b2,
+    __global const float *r, __global const float *g, __global const float* b,
+    __global const float *r2, __global const float* g2, __global const float *b2,
     int xsize, int ysize, int step)
 {
     const int res_x = get_global_id(0);
@@ -394,8 +394,8 @@ __kernel void clEdgeDetectorMap(__global float *result,
 }
 
 __kernel void clEdgeDetectorLowFreq(__global float *result,
-    __global float *r, __global float *g, __global float* b,
-    __global float *r2, __global float* g2, __global float *b2,
+    __global const float *r, __global const float *g, __global const float* b,
+    __global const float *r2, __global const float* g2, __global const float *b2,
     int xsize, int ysize, int step)
 {
     const int res_x = get_global_id(0);
@@ -415,8 +415,8 @@ __kernel void clEdgeDetectorLowFreq(__global float *result,
     int ix = pos_y * xsize + pos_x;
 
     double diff[4][3];
-    __global float* blurred0[3] = { r, g, b };
-    __global float* blurred1[3] = { r2, g2, b2 };
+    __global const float* blurred0[3] = { r, g, b };
+    __global const float* blurred1[3] = { r2, g2, b2 };
 
     for (int i = 0; i < 3; ++i) {
         int ix2 = ix + 8;
@@ -459,8 +459,8 @@ __kernel void clEdgeDetectorLowFreq(__global float *result,
 __kernel void clDoMask(
     __global float *mask_x, __global float *mask_y, __global float *mask_b,
     __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b,
-    __global double *lut_x, __global double *lut_y, __global double *lut_b,
-    __global double *lut_dc_x, __global double *lut_dc_y, __global double *lut_dc_b)
+    __global const double *lut_x, __global const double *lut_y, __global const double *lut_b,
+    __global const double *lut_dc_x, __global const double *lut_dc_y, __global const double *lut_dc_b)
 {
     const double w00 = 232.206464018;
     const double w11 = 22.9455222245;
@@ -489,8 +489,8 @@ __kernel void clDoMask(
 
 }
 
-__kernel void clBlockDiffMap(__global float* r, __global float* g, __global float* b,
-    __global float* r2, __global float* g2, __global float* b2,
+__kernel void clBlockDiffMap(__global const float* r, __global const float* g, __global const float* b,
+    __global const float* r2, __global const float* g2, __global const float* b2,
     __global float* block_diff_dc, __global float* block_diff_ac,
     int xsize, int ysize, int step)
 {
@@ -549,8 +549,8 @@ __kernel void clBlockDiffMap(__global float* r, __global float* g, __global floa
 __kernel void clMaskHighIntensityChange(
     __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b,
     __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b,
-    __global float *c0_x, __global float *c0_y, __global float *c0_b,
-    __global float *c1_x, __global float *c1_y, __global float *c1_b
+    __global const float *c0_x, __global const float *c0_y, __global const float *c0_b,
+    __global const float *c1_x, __global const float *c1_y, __global const float *c1_b
 )
 {
     const int x = get_global_id(0);
@@ -603,7 +603,7 @@ __kernel void clMaskHighIntensityChange(
     xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]);
 }
 
-__kernel void clUpsampleSquareRoot(__global float *diffmap, int xsize, int ysize, int step, __global float *diffmap_out)
+__kernel void clUpsampleSquareRoot(__global const float *diffmap, int xsize, int ysize, int step, __global float *diffmap_out)
 {
     const int res_x = get_global_id(0);
     const int res_y = get_global_id(1);
@@ -636,7 +636,7 @@ __kernel void clUpsampleSquareRoot(__global float *diffmap, int xsize, int ysize
     }
 }
 
-__kernel void clAverageAddImage(__global float *img, __global float *tmp0, __global float *tmp1)
+__kernel void clAverageAddImage(__global float *img, __global const float *tmp0, __global const float *tmp1)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -694,18 +694,13 @@ __kernel void clAverageAddImage(__global float *img, __global float *tmp0, __glo
     }
 }
 
-
-
-
-
-
 void Butteraugli8x8CornerEdgeDetectorDiff(
     int pos_x,
     int pos_y,
     int xsize,
     int ysize,
-    __global float *r, __global float *g, __global float* b,
-    __global float *r2, __global float* g2, __global float *b2,
+    __global const float *r, __global const float *g, __global const float* b,
+    __global const float *r2, __global const float* g2, __global const float *b2,
     double* diff_xyb)
 {
     int local_count = 0;
@@ -755,13 +750,11 @@ void Butteraugli8x8CornerEdgeDetectorDiff(
     }
 }
 
-
-
-double DotProduct(__global float u[3], double v[3]) {
+double DotProduct(__global const float u[3], const double v[3]) {
     return u[0] * v[0] + u[1] * v[1] + u[2] * v[2];
 }
 
-double Interpolate(__constant double *array, int size, double sx) {
+double Interpolate(__constant const double *array, const int size, const double sx) {
     double ix = fabs(sx);
 
     int baseix = (int)(ix);
@@ -843,7 +836,6 @@ void XybToVals(
     *valz = zmul * z;
 }
 
-
 #define XybLowFreqToVals_inc 5.2511644570349185
 __constant double XybLowFreqToVals_lut[21] = {
     0,
@@ -882,7 +874,6 @@ void XybLowFreqToVals(double x, double y, double z,
     *valy = Interpolate(&XybLowFreqToVals_lut[0], 21, y * ymul);
 }
 
-
 double InterpolateClampNegative(__global const double *array,
 	int size, double sx) {
 	if (sx < 0) {
@@ -932,7 +923,6 @@ typedef struct __Complex
 	double imag;
 }Complex;
 
-
 __constant double kSqrtHalf = 0.70710678118654752440084436210484903;
 void RealFFT8(const double* in, Complex* out) {
 	double t1, t2, t3, t5, t6, t7, t8;
@@ -1412,10 +1402,6 @@ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *
     *valz = b;
 }
 
-
-///==================================================
-// ��λ��������Щ��������Ϊ��ʵ��ButteraugliComparatorEx::CompareBlockEx
-
 // IntFloatPair��Ϊ��ģ��output_order input_order��vector�����Ǵ�С�̶�Ϊ8x8
 typedef struct __IntFloatPair
 {
@@ -2015,7 +2001,6 @@ void CoeffToIDCT(__private const coeff_t block[8*8], uchar out[8*8])
 	}
 }
 
-
 void IDCTToPixel8x8(const uchar idct[8 * 8], ushort pixels_[8 * 8])
 {
     const int block_x = 0;
@@ -2033,28 +2018,6 @@ void IDCTToPixel8x8(const uchar idct[8 * 8], ushort pixels_[8 * 8])
         }
     }
 }
-/*
-void IDCTToPixel(const uchar idct[8*8], ushort pixels_[8*8])
-{
-	const int block_x = 0;
-	const int block_y = 0;
-	const int width_ = 8;
-	const int height_ = 8;
-
-	for (int iy = 0; iy < 8; ++iy)
-	{
-		for (int ix = 0; ix < 8; ++ix)
-		{
-			int x = 8 * block_x + ix;
-			int y = 8 * block_y + iy;
-			if (x >= width_ || y >= height_) continue;
-			int p = y * width_ + x;
-			pixels_[p] = idct[8 * iy + ix] << 4;
-		}
-	}
-}
-*/
-
 
 void IDCTToPixel16x16(const uchar idct[8 * 8], ushort pixels_out[16 * 16], __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_)
 {
@@ -2770,8 +2733,8 @@ void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float
 
 void Convolution(size_t xsize, size_t ysize,
                  int xstep, int len, int offset,
-                 float* multipliers,
-                 float* inp,
+                 const float* multipliers,
+                 const float* inp,
                  float border_ratio,
                  float* result)
 {
@@ -2802,7 +2765,7 @@ void Convolution(size_t xsize, size_t ysize,
 
 // ian todo
 // �����������output
-void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output)
+void BlurEx(const float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output)
 {
     // �ο�clBlurEx2��ʵ�֣�sigma = 1.1����ʱstep��diff�����ػ�Ϊ�̶�ֵ
 	const double sigma = 1.1;
@@ -2826,10 +2789,9 @@ void BlurEx(float *r, int xsize, int ysize, double kSigma, double border_ratio,
               border_ratio, output);
 }
 
-
 // ian todo
 void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private float *b,
-                            __private float *r_blurred, __private float *g_blurred, __private float *b_blurred,
+                            __private const float *r_blurred, __private const float *g_blurred, __private const float *b_blurred,
                             int size)
 {
   for (size_t i = 0; i < size; ++i) {
@@ -2860,8 +2822,8 @@ void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private f
 // chrisk todo
 void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b,
     float *xyb1_x, float *xyb1_y, float *xyb1_b,
-    float *c0_x, float *c0_y, float *c0_b,
-    float *c1_x, float *c1_y, float *c1_b,
+    const float *c0_x, const float *c0_y, const float *c0_b,
+    const float *c1_x, const float *c1_y, const float *c1_b,
     int xsize, int ysize)
 {
     for (int x = 0; x < xsize; ++x)
@@ -2925,7 +2887,7 @@ void floatcopy(float *dst, const float *src, int size)
     }
 }
 
-void coeffcopy_g(coeff_t *dst, const __global coeff_t *src, int size)
+void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size)
 {
     for (int i = 0; i < size; i++)
     {
@@ -3015,8 +2977,7 @@ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8]
 
 int GetOrigBlock(float rgb0_c[3][kDCTBlockSize],
                  const __global float *orig_image_batch,
-                 int width_,
-                 int height_,
+                 int width_, int height_,
                  int block_x, int block_y,
                  int factor,
                  int off_x, int off_y)

From ea15082c300d7293de92113cfb7ec3a8be10404e Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Mon, 22 May 2017 13:10:39 +0800
Subject: [PATCH 102/189] Fix Average5x5

---
 clguetzli/clguetzli.cl       | 72 +++++++++++++-----------------------
 clguetzli/clguetzli_test.cpp |  6 +--
 2 files changed, 29 insertions(+), 49 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index b9ee9d1e..ec745d36 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -644,54 +644,34 @@ __kernel void clAverageAddImage(__global float *img, __global const float *tmp0,
     const int ysize = get_global_size(1);
 
     const int row0 = y * xsize;
-    if (x == 0) // excute once per y
-    {
-        img[row0 + 1] += tmp0[row0];
-        img[row0 + 0] += tmp0[row0 + 1];
-        img[row0 + 2] += tmp0[row0 + 1];
-
-        img[row0 + xsize - 3] += tmp0[row0 + xsize - 2];
-        img[row0 + xsize - 1] += tmp0[row0 + xsize - 2];
-        img[row0 + xsize - 2] += tmp0[row0 + xsize - 1];
-
-        if (y > 0) {
-            const int rowd1 = row0 - xsize;
-            img[rowd1 + 1] += tmp1[row0];
-            img[rowd1 + 0] += tmp0[row0];
-
-            img[rowd1 + xsize - 1] += tmp0[row0 + xsize - 1];
-            img[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1];
-        }
-        if (y + 1 < ysize) {
-            const int rowu1 = row0 + xsize;
-            img[rowu1 + 1] += tmp1[row0];
-            img[rowu1 + 0] += tmp0[row0];
-
-            img[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1];
-            img[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1];
-        }
-    }
+	if (x - 1 >= 0) {
+		img[row0 + x] += tmp0[row0 + x - 1];
+	}
+	if (x + 1 < xsize) {
+		img[row0 + x] += tmp0[row0 + x + 1];
+	}
 
-    if (x >= 2 && x < xsize - 2)
-    {
-        img[row0 + x - 1] += tmp0[row0 + x];
-        img[row0 + x + 1] += tmp0[row0 + x];
-    }
+	if (y > 0) {
+		const int rowd1 = row0 - xsize;
+		if (x - 1 >= 0) {
+			img[row0 + x] += tmp1[rowd1 + x - 1];
+		}
+		img[row0 + x] += tmp0[rowd1 + x];
+		if (x + 1 < xsize) {
+			img[row0 + x] += tmp1[rowd1 + x + 1];
+		}
+	}
 
-    if (x >= 1 && x < xsize - 1) {
-        if (y > 0) {
-            const int rowd1 = row0 - xsize;
-            img[rowd1 + x + 1] += tmp1[row0 + x];
-            img[rowd1 + x + 0] += tmp0[row0 + x];
-            img[rowd1 + x - 1] += tmp1[row0 + x];
-        }
-        if (y + 1 < ysize) {
-            const int rowu1 = row0 + xsize;
-            img[rowu1 + x + 1] += tmp1[row0 + x];
-            img[rowu1 + x + 0] += tmp0[row0 + x];
-            img[rowu1 + x - 1] += tmp1[row0 + x];
-        }
-    }
+	if (y + 1 < ysize) {
+		const int rowu1 = row0 + xsize;
+		if (x - 1 >= 0) {
+			img[row0 + x] += tmp1[rowu1 + x - 1];
+		}
+		img[row0 + x] += tmp0[rowu1 + x];
+		if (x + 1 < xsize) {
+			img[row0 + x] += tmp1[rowu1 + x + 1];
+		}
+	}
 }
 
 void Butteraugli8x8CornerEdgeDetectorDiff(
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 6dca483f..9d3d05a7 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -454,12 +454,12 @@ void tclDiffPrecompute(
   FLOAT_COMPARE(r_y, (*mask_cmp)[1].data(), xsize * ysize);
   FLOAT_COMPARE(r_b, (*mask_cmp)[2].data(), xsize * ysize);
 
-  ocl.releaseMemChannels(cl_xyb0);
-  ocl.releaseMemChannels(cl_xyb1);
-  ocl.releaseMemChannels(cl_mask);
   clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.x, r_x, 0, NULL, NULL);
   clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.y, r_y, 0, NULL, NULL);
   clEnqueueUnmapMemObject(ocl.commandQueue, cl_mask.b, r_b, 0, NULL, NULL);
+  ocl.releaseMemChannels(cl_xyb0);
+  ocl.releaseMemChannels(cl_xyb1);
+  ocl.releaseMemChannels(cl_mask);
 }
 
 // ian todo

From 64968862268f04d82ff46c68bc8be7f5df892346 Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Mon, 22 May 2017 14:53:42 +0800
Subject: [PATCH 103/189] Inline ScaleImage in kernel Average5x5

---
 clguetzli/clguetzli.cl  | 22 +++++++------
 clguetzli/clguetzli.cpp | 68 ++++++++++++-----------------------------
 clguetzli/ocl.h         |  2 +-
 3 files changed, 34 insertions(+), 58 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index ec745d36..152961dc 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -636,42 +636,46 @@ __kernel void clUpsampleSquareRoot(__global const float *diffmap, int xsize, int
     }
 }
 
-__kernel void clAverageAddImage(__global float *img, __global const float *tmp0, __global const float *tmp1)
+__kernel void clAverage5x5(__global float *img, __global const float *img_org)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
     const int xsize = get_global_size(0);
     const int ysize = get_global_size(1);
 
+	const float w = 0.679144890667f;
+	const float scale = 1.0f / (5.0f + 4 * w);
     const int row0 = y * xsize;
 	if (x - 1 >= 0) {
-		img[row0 + x] += tmp0[row0 + x - 1];
+		img[row0 + x] += img_org[row0 + x - 1];
 	}
 	if (x + 1 < xsize) {
-		img[row0 + x] += tmp0[row0 + x + 1];
+		img[row0 + x] += img_org[row0 + x + 1];
 	}
 
 	if (y > 0) {
 		const int rowd1 = row0 - xsize;
 		if (x - 1 >= 0) {
-			img[row0 + x] += tmp1[rowd1 + x - 1];
+			img[row0 + x] += img_org[rowd1 + x - 1] * w;
 		}
-		img[row0 + x] += tmp0[rowd1 + x];
+		img[row0 + x] += img_org[rowd1 + x];
 		if (x + 1 < xsize) {
-			img[row0 + x] += tmp1[rowd1 + x + 1];
+			img[row0 + x] += img_org[rowd1 + x + 1] * w;
 		}
 	}
 
 	if (y + 1 < ysize) {
 		const int rowu1 = row0 + xsize;
 		if (x - 1 >= 0) {
-			img[row0 + x] += tmp1[rowu1 + x - 1];
+			img[row0 + x] += img_org[rowu1 + x - 1] * w;
 		}
-		img[row0 + x] += tmp0[rowu1 + x];
+		img[row0 + x] += img_org[rowu1 + x];
 		if (x + 1 < xsize) {
-			img[row0 + x] += tmp1[rowu1 + x + 1];
+			img[row0 + x] += img_org[rowu1 + x + 1] * w;
 		}
 	}
+
+	img[row0 + x] *= scale;
 }
 
 void Butteraugli8x8CornerEdgeDetectorDiff(
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 67eb4918..6be1e014 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -59,7 +59,7 @@ ocl_args_d_t& getOcl(void)
 	ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "clUpsampleSquareRoot", &err);
 	ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "clAddBorder", &err);
 	ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "clRemoveBorder", &err);
-	ocl.kernel[KERNEL_AVERAGEADDIMAGE] = clCreateKernel(ocl.program, "clAverageAddImage", &err);
+	ocl.kernel[KERNEL_AVERAGE5X5] = clCreateKernel(ocl.program, "clAverage5x5", &err);
 	ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMap", &err);
 	ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMap", &err);
 	ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreq", &err);
@@ -642,29 +642,6 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w)
 	}
 }
 
-void clAverageAddImage(cl_mem img, cl_mem tmp0, cl_mem tmp1, size_t xsize, size_t ysize)
-{
-	cl_int err = CL_SUCCESS;
-	ocl_args_d_t &ocl = getOcl();
-
-	cl_kernel kernel = ocl.kernel[KERNEL_AVERAGEADDIMAGE];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&tmp0);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&tmp1);
-
-	size_t globalWorkSize[2] = { xsize, ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clAverageAddImage() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
-	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clAverageAddImage() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
-}
-
 void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize)
 {
 	if (xsize < 4 || ysize < 4) {
@@ -677,30 +654,25 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize)
 
 	size_t len = xsize * ysize * sizeof(float);
 	ocl.allocA(len);
-	ocl.allocB(len);
-	ocl.allocC(len);
-	cl_mem result = ocl.srcA;
-	cl_mem tmp0 = ocl.srcB;
-	cl_mem tmp1 = ocl.dstMem;
-
-	err = clEnqueueCopyBuffer(ocl.commandQueue, img, result, 0, 0, len, 0, NULL, NULL);
-	err = clEnqueueCopyBuffer(ocl.commandQueue, img, tmp0, 0, 0, len, 0, NULL, NULL);
-	err = clEnqueueCopyBuffer(ocl.commandQueue, img, tmp1, 0, 0, len, 0, NULL, NULL);
-
-	static const float w = 0.679144890667f;
-	static const float scale = 1.0f / (5.0f + 4 * w);
-
-	clScaleImageEx(tmp1, xsize * ysize, w);
-	clAverageAddImage(result, tmp0, tmp1, xsize, ysize);
-
-	err = clEnqueueCopyBuffer(ocl.commandQueue, result, img, 0, 0, len, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clAverage5x5Ex() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err));
-	}
-	err = clFinish(ocl.commandQueue);
-
-	clScaleImageEx(img, xsize * ysize, scale);
+	cl_mem tmp = ocl.srcA;
+
+	err = clEnqueueCopyBuffer(ocl.commandQueue, img, tmp, 0, 0, len, 0, NULL, NULL);
+
+  cl_kernel kernel = ocl.kernel[KERNEL_AVERAGE5X5];
+  clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img);
+  clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&tmp);
+
+  size_t globalWorkSize[2] = { xsize, ysize };
+  err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+  if (CL_SUCCESS != err)
+  {
+    LogError("Error: clAverage5x5Ex() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+  }
+  err = clFinish(ocl.commandQueue);
+  if (CL_SUCCESS != err)
+  {
+    LogError("Error: clAverage5x5Ex() clFinish returned %s.\n", TranslateOpenCLError(err));
+  }
 }
 
 void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset)
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 15e115af..b74a8a58 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -60,7 +60,7 @@ enum KernelName {
 	KERNEL_UPSAMPLESQUAREROOT,
 	KERNEL_ADDBORDER,
 	KERNEL_REMOVEBORDER,
-	KERNEL_AVERAGEADDIMAGE,
+  KERNEL_AVERAGE5X5,
 	KERNEL_EDGEDETECTOR,
 	KERNEL_BLOCKDIFFMAP,
 	KERNEL_EDGEDETECTORLOWFREQ,

From 89cda39c53d9281f5f1fe6d86e3118d408671f92 Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Mon, 22 May 2017 15:38:01 +0800
Subject: [PATCH 104/189] Avoid recomputing constant values in each work item

---
 clguetzli/clguetzli.cl | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 152961dc..5548e7a1 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -462,16 +462,16 @@ __kernel void clDoMask(
     __global const double *lut_x, __global const double *lut_y, __global const double *lut_b,
     __global const double *lut_dc_x, __global const double *lut_dc_y, __global const double *lut_dc_b)
 {
-    const double w00 = 232.206464018;
-    const double w11 = 22.9455222245;
-    const double w22 = 503.962310606;
-
     const int x = get_global_id(0);
     const int y = get_global_id(1);
 
     const int xsize = get_global_size(0);
     const int ysize = get_global_size(1);
 
+	const double w00 = 232.206464018;
+	const double w11 = 22.9455222245;
+	const double w22 = 503.962310606;
+
     const size_t idx = y * xsize + x;
     const double s0 = mask_x[idx];
     const double s1 = mask_y[idx];
@@ -636,6 +636,8 @@ __kernel void clUpsampleSquareRoot(__global const float *diffmap, int xsize, int
     }
 }
 
+#define Average5x5_w 0.679144890667f
+__constant float Average5x5_scale = 1.0f / (5.0f + 4 * Average5x5_w);
 __kernel void clAverage5x5(__global float *img, __global const float *img_org)
 {
     const int x = get_global_id(0);
@@ -643,8 +645,6 @@ __kernel void clAverage5x5(__global float *img, __global const float *img_org)
     const int xsize = get_global_size(0);
     const int ysize = get_global_size(1);
 
-	const float w = 0.679144890667f;
-	const float scale = 1.0f / (5.0f + 4 * w);
     const int row0 = y * xsize;
 	if (x - 1 >= 0) {
 		img[row0 + x] += img_org[row0 + x - 1];
@@ -656,26 +656,26 @@ __kernel void clAverage5x5(__global float *img, __global const float *img_org)
 	if (y > 0) {
 		const int rowd1 = row0 - xsize;
 		if (x - 1 >= 0) {
-			img[row0 + x] += img_org[rowd1 + x - 1] * w;
+			img[row0 + x] += img_org[rowd1 + x - 1] * Average5x5_w;
 		}
 		img[row0 + x] += img_org[rowd1 + x];
 		if (x + 1 < xsize) {
-			img[row0 + x] += img_org[rowd1 + x + 1] * w;
+			img[row0 + x] += img_org[rowd1 + x + 1] * Average5x5_w;
 		}
 	}
 
 	if (y + 1 < ysize) {
 		const int rowu1 = row0 + xsize;
 		if (x - 1 >= 0) {
-			img[row0 + x] += img_org[rowu1 + x - 1] * w;
+			img[row0 + x] += img_org[rowu1 + x - 1] * Average5x5_w;
 		}
 		img[row0 + x] += img_org[rowu1 + x];
 		if (x + 1 < xsize) {
-			img[row0 + x] += img_org[rowu1 + x + 1] * w;
+			img[row0 + x] += img_org[rowu1 + x + 1] * Average5x5_w;
 		}
 	}
 
-	img[row0 + x] *= scale;
+	img[row0 + x] *= Average5x5_scale;
 }
 
 void Butteraugli8x8CornerEdgeDetectorDiff(

From f54bc0ea0177358e818389f3631ee6042abe3376 Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Mon, 22 May 2017 15:42:00 +0800
Subject: [PATCH 105/189] Fix tclCalculateDiffmap

---
 clguetzli/clguetzli_test.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 9d3d05a7..1797fe66 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -321,10 +321,10 @@ void tclCalculateDiffmap(const size_t xsize, const size_t ysize,
 	cl_mem mem_diffmap = ocl.allocMem(length);
 	clEnqueueWriteBuffer(ocl.commandQueue, mem_diffmap, CL_FALSE, 0, org_len * sizeof(float), diffmap, 0, NULL, NULL);
 	clCalculateDiffmapEx(mem_diffmap, xsize, ysize, step);
-	//cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diffmap, true, CL_MAP_READ, 0, length, 0, NULL, NULL, &err);
-  //err = clFinish(ocl.commandQueue);
-	//FLOAT_COMPARE(result_tmp, diffmap_cmp, xsize * ysize);
-  //clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, 0, NULL, NULL);
+	cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diffmap, true, CL_MAP_READ, 0, length, 0, NULL, NULL, &err);
+  err = clFinish(ocl.commandQueue);
+	FLOAT_COMPARE(result_tmp, diffmap_cmp, xsize * ysize);
+  clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, 0, NULL, NULL);
 	clReleaseMemObject(mem_diffmap);
 }
 

From 36f2e52e8c7d00494934c8258a9f1042f8abf1c2 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 24 May 2017 11:49:57 +0800
Subject: [PATCH 106/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

---
 clguetzli/clbutter_comparator.h               |   2 +-
 clguetzli/clguetzli.cl                        |   8 +-
 clguetzli/clguetzli.cl.h                      |   2 +-
 clguetzli/clguetzli.cpp                       |   4 +-
 clguetzli/clguetzli.h                         |   6 +-
 clguetzli/ocl.cpp                             |   6 +
 clguetzli/ocl.h                               |   2 +-
 clguetzli/utils.cpp                           |  11 +-
 guetzli.make                                  | 190 ++++++++++++++++--
 guetzli/butteraugli_comparator.h              |   2 +-
 guetzli/guetzli.cc                            |   2 +-
 guetzli/processor.cc                          |   2 +-
 guetzli_static.make                           | 181 +++++++++++++++--
 premake5.lua                                  |  12 +-
 .../butteraugli/butteraugli/butteraugli.cc    |   6 +-
 15 files changed, 374 insertions(+), 62 deletions(-)

diff --git a/clguetzli/clbutter_comparator.h b/clguetzli/clbutter_comparator.h
index 19ca163f..41e332ed 100644
--- a/clguetzli/clbutter_comparator.h
+++ b/clguetzli/clbutter_comparator.h
@@ -1,6 +1,6 @@
 #pragma once
 #include <vector>
-#include "butteraugli\butteraugli.h"
+#include "butteraugli/butteraugli.h"
 
 #define __restrict__
 
diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 5548e7a1..9722b08d 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1,6 +1,6 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
-#include  "clguetzli\clguetzli.cl.h"
+#include  "clguetzli/clguetzli.cl.h"
 
 #define kBlockEdge      8
 #define kBlockSize      (kBlockEdge * kBlockEdge)
@@ -258,6 +258,12 @@ __kernel void clAddBorder(__global float *out, int s, int s2, __global const flo
     const int xsize = get_global_size(0);
     const int ysize = get_global_size(1);
 
+	if (x >= xsize - s ||
+	    y >= ysize - s)
+	{
+		return;
+	}
+
     const double mul1 = 24.8235314874;
     out[(y + s2) * xsize + x + s2] += (float)(mul1) * in[y * (xsize - s) + x];
 
diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h
index 529ca141..35b4ed3c 100644
--- a/clguetzli/clguetzli.cl.h
+++ b/clguetzli/clguetzli.cl.h
@@ -41,7 +41,7 @@
             {
                 cl_mem x;
                 cl_mem y;
-                cl_mem b;
+                cl_mem b_;
             };
             union
             {
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 6be1e014..b12bdfc4 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -22,7 +22,7 @@ ocl_args_d_t& getOcl(void)
 
 	char* source = nullptr;
 	size_t src_size = 0;
-	ReadSourceFromFile("clguetzli\\clguetzli.cl", &source, &src_size);
+	ReadSourceFromFile("clguetzli/clguetzli.cl", &source, &src_size);
 
 	ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err);
 
@@ -1060,7 +1060,7 @@ void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in)
 	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&cls2);
 	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&in);
 
-	size_t globalWorkSize[2] = { xsize - cls, ysize - cls };
+	size_t globalWorkSize[2] = { xsize, ysize};
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 4e6f3209..3235103e 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -1,8 +1,8 @@
 #pragma once
 #include <vector>
-#include "CL\cl.h"
-#include "guetzli\processor.h"
-#include "guetzli\butteraugli_comparator.h"
+#include "CL/cl.h"
+#include "guetzli/processor.h"
+#include "guetzli/butteraugli_comparator.h"
 #include "ocl.h"
 #include "clguetzli.cl.h"
 
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index 7be57d49..517b42c3 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -1,4 +1,10 @@
 #include "ocl.h"
+#include <string.h>
+#ifdef __linux__
+#include <malloc.h>
+#define _aligned_malloc memalign
+#define _aligned_free free
+#endif
 #include <vector>
 
 
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index b74a8a58..d72000b3 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "CL\cl.h"
+#include "CL/cl.h"
 #include "utils.h"
 #include "clguetzli.cl.h"
 
diff --git a/clguetzli/utils.cpp b/clguetzli/utils.cpp
index 24520cd8..4fc8dbc2 100644
--- a/clguetzli/utils.cpp
+++ b/clguetzli/utils.cpp
@@ -22,11 +22,10 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include <tchar.h>
 #include <memory.h>
-#include <windows.h>
-#include "CL\cl.h"
-#include "CL\cl_ext.h"
+#include <stdarg.h>
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
 #include "utils.h"
 #include <assert.h>
 
@@ -70,7 +69,11 @@ int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize)
     int errorCode = CL_SUCCESS;
 
     FILE* fp = NULL;
+#ifdef __linux__
+    fp = fopen(fileName, "rb");
+#else
     fopen_s(&fp, fileName, "rb");
+#endif
     if (fp == NULL)
     {
         LogError("Error: Couldn't find program source file '%s'.\n", fileName);
diff --git a/guetzli.make b/guetzli.make
index 7edeea3f..442d678b 100644
--- a/guetzli.make
+++ b/guetzli.make
@@ -16,15 +16,15 @@ ifeq ($(config),release)
   TARGET = $(TARGETDIR)/guetzli
   OBJDIR = obj/Release/guetzli
   DEFINES +=
-  INCLUDES += -I. -Ithird_party/butteraugli
+  INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)
   ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -O3 -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES)
-  LIBS +=
+  LIBS += -lOpenCL
   LDDEPS +=
-  ALL_LDFLAGS += $(LDFLAGS) `pkg-config --libs libpng || libpng-config --ldflags`
+  ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags`
   LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS)
   define PREBUILDCMDS
   endef
@@ -32,7 +32,7 @@ ifeq ($(config),release)
   endef
   define POSTBUILDCMDS
   endef
-all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET)
+all: prebuild prelink $(TARGET)
 	@:
 
 endif
@@ -43,15 +43,15 @@ ifeq ($(config),debug)
   TARGET = $(TARGETDIR)/guetzli
   OBJDIR = obj/Debug/guetzli
   DEFINES +=
-  INCLUDES += -I. -Ithird_party/butteraugli
+  INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)
   ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES)
-  LIBS +=
+  LIBS += -lOpenCL
   LDDEPS +=
-  ALL_LDFLAGS += $(LDFLAGS) `pkg-config --libs libpng || libpng-config --ldflags`
+  ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags`
   LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS)
   define PREBUILDCMDS
   endef
@@ -59,12 +59,18 @@ ifeq ($(config),debug)
   endef
   define POSTBUILDCMDS
   endef
-all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET)
+all: prebuild prelink $(TARGET)
 	@:
 
 endif
 
 OBJECTS := \
+	$(OBJDIR)/clbutter_comparator.o \
+	$(OBJDIR)/clguetzli.cl.o \
+	$(OBJDIR)/clguetzli.o \
+	$(OBJDIR)/clguetzli_test.o \
+	$(OBJDIR)/ocl.o \
+	$(OBJDIR)/utils.o \
 	$(OBJDIR)/butteraugli_comparator.o \
 	$(OBJDIR)/dct_double.o \
 	$(OBJDIR)/debug_print.o \
@@ -101,24 +107,13 @@ endif
 
 $(TARGET): $(GCH) ${CUSTOMFILES} $(OBJECTS) $(LDDEPS) $(RESOURCES)
 	@echo Linking guetzli
-	$(SILENT) $(LINKCMD)
-	$(POSTBUILDCMDS)
-
-$(TARGETDIR):
-	@echo Creating $(TARGETDIR)
 ifeq (posix,$(SHELLTYPE))
 	$(SILENT) mkdir -p $(TARGETDIR)
 else
 	$(SILENT) mkdir $(subst /,\\,$(TARGETDIR))
 endif
-
-$(OBJDIR):
-	@echo Creating $(OBJDIR)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
+	$(SILENT) $(LINKCMD)
+	$(POSTBUILDCMDS)
 
 clean:
 	@echo Cleaning guetzli
@@ -143,68 +138,221 @@ $(GCH): $(PCH)
 	$(SILENT) $(CXX) -x c++-header $(ALL_CXXFLAGS) -o "$@" -MF "$(@:%.gch=%.d)" -c "$<"
 endif
 
+$(OBJDIR)/clbutter_comparator.o: clguetzli/clbutter_comparator.cpp
+	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/clguetzli.cl.o: clguetzli/clguetzli.cl.cpp
+	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/clguetzli.o: clguetzli/clguetzli.cpp
+	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp
+	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/ocl.o: clguetzli/ocl.cpp
+	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/utils.o: clguetzli/utils.cpp
+	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/butteraugli_comparator.o: guetzli/butteraugli_comparator.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/dct_double.o: guetzli/dct_double.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/debug_print.o: guetzli/debug_print.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/entropy_encode.o: guetzli/entropy_encode.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/fdct.o: guetzli/fdct.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/gamma_correct.o: guetzli/gamma_correct.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/guetzli.o: guetzli/guetzli.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/idct.o: guetzli/idct.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data.o: guetzli/jpeg_data.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_decoder.o: guetzli/jpeg_data_decoder.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_encoder.o: guetzli/jpeg_data_encoder.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_reader.o: guetzli/jpeg_data_reader.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_writer.o: guetzli/jpeg_data_writer.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_huffman_decode.o: guetzli/jpeg_huffman_decode.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/output_image.o: guetzli/output_image.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/preprocess_downsample.o: guetzli/preprocess_downsample.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/processor.o: guetzli/processor.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/quality.o: guetzli/quality.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/quantize.o: guetzli/quantize.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/score.o: guetzli/score.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/butteraugli.o: third_party/butteraugli/butteraugli/butteraugli.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 
 -include $(OBJECTS:%.o=%.d)
diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h
index 5418c0d2..5fd140ba 100644
--- a/guetzli/butteraugli_comparator.h
+++ b/guetzli/butteraugli_comparator.h
@@ -20,7 +20,7 @@
 #include <vector>
 
 #include "butteraugli/butteraugli.h"
-#include "clguetzli\clbutter_comparator.h"
+#include "clguetzli/clbutter_comparator.h"
 #include "guetzli/comparator.h"
 #include "guetzli/jpeg_data.h"
 #include "guetzli/output_image.h"
diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc
index 5982bc1c..40544d90 100644
--- a/guetzli/guetzli.cc
+++ b/guetzli/guetzli.cc
@@ -28,7 +28,7 @@
 #include "guetzli/processor.h"
 #include "guetzli/quality.h"
 #include "guetzli/stats.h"
-#include "clguetzli\clguetzli.h"
+#include "clguetzli/clguetzli.h"
 
 namespace {
 
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 43b513dc..e5439460 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -31,7 +31,7 @@
 #include "guetzli/jpeg_data_writer.h"
 #include "guetzli/output_image.h"
 #include "guetzli/quantize.h"
-#include "clguetzli\clguetzli.h"
+#include "clguetzli/clguetzli.h"
 
 namespace guetzli {
 
diff --git a/guetzli_static.make b/guetzli_static.make
index d20fb77d..f271c46f 100644
--- a/guetzli_static.make
+++ b/guetzli_static.make
@@ -16,7 +16,7 @@ ifeq ($(config),release)
   TARGET = $(TARGETDIR)/libguetzli_static.a
   OBJDIR = obj/Release/guetzli_static
   DEFINES +=
-  INCLUDES += -I. -Ithird_party/butteraugli
+  INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)
   ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --static --cflags libpng || libpng-config --static --cflags`
@@ -24,7 +24,7 @@ ifeq ($(config),release)
   ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES)
   LIBS +=
   LDDEPS +=
-  ALL_LDFLAGS += $(LDFLAGS) `pkg-config --static --libs libpng || libpng-config --static --ldflags`
+  ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --static --libs libpng || libpng-config --static --ldflags`
   LINKCMD = $(AR) -rcs "$@" $(OBJECTS)
   define PREBUILDCMDS
   endef
@@ -32,7 +32,7 @@ ifeq ($(config),release)
   endef
   define POSTBUILDCMDS
   endef
-all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET)
+all: prebuild prelink $(TARGET)
 	@:
 
 endif
@@ -43,7 +43,7 @@ ifeq ($(config),debug)
   TARGET = $(TARGETDIR)/libguetzli_static.a
   OBJDIR = obj/Debug/guetzli_static
   DEFINES +=
-  INCLUDES += -I. -Ithird_party/butteraugli
+  INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)
   ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --static --cflags libpng || libpng-config --static --cflags`
@@ -51,7 +51,7 @@ ifeq ($(config),debug)
   ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES)
   LIBS +=
   LDDEPS +=
-  ALL_LDFLAGS += $(LDFLAGS) `pkg-config --static --libs libpng || libpng-config --static --ldflags`
+  ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --static --libs libpng || libpng-config --static --ldflags`
   LINKCMD = $(AR) -rcs "$@" $(OBJECTS)
   define PREBUILDCMDS
   endef
@@ -59,12 +59,18 @@ ifeq ($(config),debug)
   endef
   define POSTBUILDCMDS
   endef
-all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET)
+all: prebuild prelink $(TARGET)
 	@:
 
 endif
 
 OBJECTS := \
+	$(OBJDIR)/clbutter_comparator.o \
+	$(OBJDIR)/clguetzli.cl.o \
+	$(OBJDIR)/clguetzli.o \
+	$(OBJDIR)/clguetzli_test.o \
+	$(OBJDIR)/ocl.o \
+	$(OBJDIR)/utils.o \
 	$(OBJDIR)/butteraugli_comparator.o \
 	$(OBJDIR)/dct_double.o \
 	$(OBJDIR)/debug_print.o \
@@ -100,24 +106,13 @@ endif
 
 $(TARGET): $(GCH) ${CUSTOMFILES} $(OBJECTS) $(LDDEPS) $(RESOURCES)
 	@echo Linking guetzli_static
-	$(SILENT) $(LINKCMD)
-	$(POSTBUILDCMDS)
-
-$(TARGETDIR):
-	@echo Creating $(TARGETDIR)
 ifeq (posix,$(SHELLTYPE))
 	$(SILENT) mkdir -p $(TARGETDIR)
 else
 	$(SILENT) mkdir $(subst /,\\,$(TARGETDIR))
 endif
-
-$(OBJDIR):
-	@echo Creating $(OBJDIR)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
+	$(SILENT) $(LINKCMD)
+	$(POSTBUILDCMDS)
 
 clean:
 	@echo Cleaning guetzli_static
@@ -142,65 +137,213 @@ $(GCH): $(PCH)
 	$(SILENT) $(CXX) -x c++-header $(ALL_CXXFLAGS) -o "$@" -MF "$(@:%.gch=%.d)" -c "$<"
 endif
 
+$(OBJDIR)/clbutter_comparator.o: clguetzli/clbutter_comparator.cpp
+	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/clguetzli.cl.o: clguetzli/clguetzli.cl.cpp
+	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/clguetzli.o: clguetzli/clguetzli.cpp
+	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp
+	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/ocl.o: clguetzli/ocl.cpp
+	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/utils.o: clguetzli/utils.cpp
+	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/butteraugli_comparator.o: guetzli/butteraugli_comparator.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/dct_double.o: guetzli/dct_double.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/debug_print.o: guetzli/debug_print.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/entropy_encode.o: guetzli/entropy_encode.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/fdct.o: guetzli/fdct.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/gamma_correct.o: guetzli/gamma_correct.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/idct.o: guetzli/idct.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data.o: guetzli/jpeg_data.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_decoder.o: guetzli/jpeg_data_decoder.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_encoder.o: guetzli/jpeg_data_encoder.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_reader.o: guetzli/jpeg_data_reader.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_writer.o: guetzli/jpeg_data_writer.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_huffman_decode.o: guetzli/jpeg_huffman_decode.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/output_image.o: guetzli/output_image.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/preprocess_downsample.o: guetzli/preprocess_downsample.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/processor.o: guetzli/processor.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/quality.o: guetzli/quality.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/quantize.o: guetzli/quantize.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/score.o: guetzli/score.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/butteraugli.o: third_party/butteraugli/butteraugli/butteraugli.cc
 	@echo $(notdir $<)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 
 -include $(OBJECTS:%.o=%.d)
diff --git a/premake5.lua b/premake5.lua
index 1a109d7a..18f5ecee 100644
--- a/premake5.lua
+++ b/premake5.lua
@@ -2,7 +2,8 @@ workspace "guetzli"
   configurations { "Release", "Debug" }
   language "C++"
   flags { "C++11" }
-  includedirs { ".", "third_party/butteraugli" }
+  includedirs { ".", "third_party/butteraugli", "clguetzli", "$(OPENCL_INC)" }
+  libdirs { "$(OPENCL_LIB)" }
 
   filter "action:vs*"
     platforms { "x86_64", "x86" }
@@ -29,7 +30,9 @@ workspace "guetzli"
         "guetzli/*.cc",
         "guetzli/*.h",
         "third_party/butteraugli/butteraugli/butteraugli.cc",
-        "third_party/butteraugli/butteraugli/butteraugli.h"
+        "third_party/butteraugli/butteraugli/butteraugli.h",
+        "clguetzli/*.cpp",
+        "clguetzli/*.h"
       }
     removefiles "guetzli/guetzli.cc"
     filter "action:gmake"
@@ -41,6 +44,7 @@ workspace "guetzli"
     filter "action:gmake"
       linkoptions { "`pkg-config --libs libpng || libpng-config --ldflags`" }
       buildoptions { "`pkg-config --cflags libpng || libpng-config --cflags`" }
+      links { "OpenCL" }
     filter "action:vs*"
       links { "shlwapi" }
     filter {}
@@ -49,5 +53,7 @@ workspace "guetzli"
         "guetzli/*.cc",
         "guetzli/*.h",
         "third_party/butteraugli/butteraugli/butteraugli.cc",
-        "third_party/butteraugli/butteraugli/butteraugli.h"
+        "third_party/butteraugli/butteraugli/butteraugli.h",
+        "clguetzli/*.cpp",
+        "clguetzli/*.h"
       }
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 69511051..1b2c16f7 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -40,9 +40,9 @@
 #include <algorithm>
 #include <array>
 
-#include "clguetzli\clbutter_comparator.h"
-#include "clguetzli\clguetzli.h"
-#include "clguetzli\clguetzli_test.h"
+#include "clguetzli/clbutter_comparator.h"
+#include "clguetzli/clguetzli.h"
+#include "clguetzli/clguetzli_test.h"
 
 // Restricted pointers speed up Convolution(); MSVC uses a different keyword.
 #ifdef _MSC_VER

From ec42b7b6f02baecc185a161070dbc33fd72750b7 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 24 May 2017 14:39:50 +0800
Subject: [PATCH 107/189] const control

---
 clguetzli/clguetzli.cpp      | 66 +++++++++++++-----------------------
 clguetzli/clguetzli.h        | 39 ++++++++++-----------
 clguetzli/clguetzli_test.cpp | 32 ++++++++---------
 clguetzli/clguetzli_test.h   | 33 ++++++++----------
 clguetzli/ocl.cpp            | 12 ++-----
 clguetzli/ocl.h              |  4 +--
 6 files changed, 80 insertions(+), 106 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index b12bdfc4..f6618d3c 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -251,10 +251,7 @@ void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size);
-
-	clEnqueueWriteBuffer(ocl.commandQueue, mem_expn, CL_FALSE, 0, sizeof(cl_float) * expn_size, expn.data(), 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
+	cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data());
 
 	if (xstep > 1)
 	{
@@ -272,8 +269,8 @@ void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
 
 	clReleaseMemObject(mem_expn);
 }
-void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
-              double sigma, double border_ratio,
+void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
+              const double sigma, const double border_ratio,
               cl_mem result/*out, opt*/)
 {
 	clBlurEx2(image, xsize, ysize, sigma, border_ratio, result);
@@ -296,10 +293,7 @@ void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size);
-
-	clEnqueueWriteBuffer(ocl.commandQueue, mem_expn, CL_FALSE, 0, sizeof(cl_float) * expn_size, expn.data(), 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
+	cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data());
 
 	if (xstep > 1)
 	{
@@ -320,7 +314,7 @@ void clBlurEx(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
 	clReleaseMemObject(mem_expn);
 }
 
-void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t xsize, size_t ysize)
+void clOpsinDynamicsImageEx(ocl_channels &rgb/*in,out*/, const size_t xsize, const size_t ysize)
 {
 	static const double kSigma = 1.1;
 
@@ -359,18 +353,13 @@ void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t xsize, size_t ysi
 	ocl.releaseMemChannels(rgb_blurred);
 }
 
-void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b)
+void clOpsinDynamicsImage(const size_t xsize, const size_t ysize, float* r, float* g, float* b)
 {
 	cl_int channel_size = xsize * ysize * sizeof(float);
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-    ocl_channels rgb = ocl.allocMemChannels(channel_size);
-
-	clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
+    ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
 
 	clOpsinDynamicsImageEx(rgb, xsize, ysize);
 
@@ -392,9 +381,9 @@ void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float*
     ocl.releaseMemChannels(rgb);
 }
 
-void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/,
-								 ocl_channels xyb1/*in,out*/,
-                                 size_t xsize, size_t ysize)
+void clMaskHighIntensityChangeEx(ocl_channels &xyb0/*in,out*/,
+								 ocl_channels &xyb1/*in,out*/,
+                                 const size_t xsize, const size_t ysize)
 {
 	cl_int channel_size = xsize * ysize * sizeof(float);
 
@@ -442,7 +431,8 @@ void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/,
 	ocl.releaseMemChannels(c1);
 }
 
-void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, size_t step, cl_mem result/*out*/)
+void clEdgeDetectorMapEx(const ocl_channels &rgb, const ocl_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step, cl_mem result/*out*/)
 {
 	cl_int channel_size = xsize * ysize * sizeof(float);
 
@@ -495,8 +485,8 @@ void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size
 	ocl.releaseMemChannels(rgb2_blured);
 }
 
-void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2,
-	size_t xsize, size_t ysize, size_t step,
+void clBlockDiffMapEx(const ocl_channels &rgb, const ocl_channels &rgb2,
+	const size_t xsize, const size_t ysize, const size_t step,
 	cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/)
 {
 	cl_int err = CL_SUCCESS;
@@ -535,8 +525,8 @@ void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2,
 	}
 }
 
-void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2,
-	size_t xsize, size_t ysize, size_t step,
+void clEdgeDetectorLowFreqEx(const ocl_channels &rgb, const ocl_channels &rgb2,
+	const size_t xsize, const size_t ysize, const size_t step,
 	cl_mem block_diff_ac/*out*/)
 {
 	cl_int channel_size = xsize * ysize * sizeof(float);
@@ -851,8 +841,8 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 }
 
 
-void clMaskEx(ocl_channels rgb, ocl_channels rgb2,
-	size_t xsize, size_t ysize,
+void clMaskEx(const ocl_channels &rgb, const ocl_channels &rgb2,
+	const size_t xsize, const size_t ysize,
 	ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/)
 {
     clDiffPrecomputeEx(rgb, rgb2, xsize, ysize, mask);
@@ -927,8 +917,8 @@ void clMask(const float* r, const float* g, const float* b,
 }
 
 void clCombineChannelsEx(
-	ocl_channels mask,
-	ocl_channels mask_dc,
+	const ocl_channels &mask,
+	const ocl_channels &mask_dc,
 	cl_mem block_diff_dc,
 	cl_mem block_diff_ac,
 	cl_mem edge_detector_map,
@@ -1098,7 +1088,7 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize,
 }
 
 void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
-								 float* r2, float* g2, float* b2,
+								 const float* r2, const float* g2, const float* b2,
 								 size_t xsize, size_t ysize,
 								 size_t step,
 								 float* result)
@@ -1112,18 +1102,10 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	ocl_channels xyb0 = ocl.allocMemChannels(channel_size);
-	ocl_channels xyb1 = ocl.allocMemChannels(channel_size);
-
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
+	ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
+	ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
 
-	cl_mem mem_result = ocl.allocMem(channel_size);
+    cl_mem mem_result = ocl.allocMem(channel_size);
 	const float pattern = 0;
 	clEnqueueFillBuffer(ocl.commandQueue, mem_result, &pattern, sizeof(float), 0, res_xsize * res_ysize, 0, NULL, NULL);
 	clEnqueueWriteBuffer(ocl.commandQueue, mem_result, CL_FALSE, 0, channel_step_size, result, 0, NULL, NULL);
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 3235103e..4d4a2fcf 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -9,12 +9,12 @@
 extern bool g_useOpenCL;
 extern bool g_checkOpenCL;
 
-void clOpsinDynamicsImage(size_t xsize, size_t ysize, float* r, float* g, float* b);
+void clOpsinDynamicsImage(const size_t xsize, const size_t ysize, float* r, float* g, float* b);
 
 void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
-    float* r2, float* g2, float* b2,
-    size_t xsize, size_t ysize,
-    size_t step,
+    const float* r2, const float* g2, const float* b2,
+    const size_t xsize, const size_t ysize,
+    const size_t step,
     float* result);
 
 void clComputeBlockZeroingOrder(
@@ -31,35 +31,36 @@ void clComputeBlockZeroingOrder(
 
 void clMask(const float* r, const float* g, const float* b,
     const float* r2, const float* g2, const float* b2,
-    size_t xsize, size_t ysize,
+    const size_t xsize, const size_t ysize,
     float* mask_r, float* mask_g, float* mask_b,
     float* maskdc_r, float* maskdc_g, float* maskdc_b);
 
-void clMaskHighIntensityChangeEx(ocl_channels xyb0/*in,out*/,
-	ocl_channels xyb1/*in,out*/,
-	size_t xsize, size_t ysize);
+void clMaskHighIntensityChangeEx(ocl_channels &xyb0/*in,out*/,
+	ocl_channels &xyb1/*in,out*/,
+	const size_t xsize, const size_t ysize);
 
-void clMaskEx(ocl_channels rgb, ocl_channels rgb2,
-	size_t xsize, size_t ysize,
+void clMaskEx(const ocl_channels &rgb, const ocl_channels &rgb2,
+	const size_t xsize, const size_t ysize,
 	ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/);
 
-void clEdgeDetectorMapEx(ocl_channels rgb, ocl_channels rgb2, size_t xsize, size_t ysize, size_t step, cl_mem result/*out*/);
+void clEdgeDetectorMapEx(const ocl_channels &rgb, const ocl_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step, cl_mem result/*out*/);
 
-void clBlockDiffMapEx(ocl_channels rgb, ocl_channels rgb2,
-	size_t xsize, size_t ysize, size_t step,
+void clBlockDiffMapEx(const ocl_channels &rgb, const ocl_channels &rgb2,
+	const size_t xsize, const size_t ysize, const size_t step,
 	cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/);
 
-void clEdgeDetectorLowFreqEx(ocl_channels rgb, ocl_channels rgb2,
-	size_t xsize, size_t ysize, size_t step,
+void clEdgeDetectorLowFreqEx(const ocl_channels &rgb, const ocl_channels &rgb2,
+	const size_t xsize, const size_t ysize, const size_t step,
 	cl_mem block_diff_ac/*in,out*/);
 
-void clBlurEx(cl_mem image, size_t xsize, size_t ysize, double sigma, double border_ratio, cl_mem result = nullptr);
+void clBlurEx(cl_mem image, const size_t xsize, const size_t ysize, const double sigma, const double border_ratio, cl_mem result = nullptr);
 
-void clOpsinDynamicsImageEx(ocl_channels rgb/*in,out*/, size_t xsize, size_t ysize);
+void clOpsinDynamicsImageEx(ocl_channels &rgb/*in,out*/, const size_t xsize, const size_t ysize);
 
 void clCombineChannelsEx(
-	ocl_channels mask,
-	ocl_channels mask_dc,
+	const ocl_channels &mask,
+	const ocl_channels &mask_dc,
 	cl_mem block_diff_dc,
 	cl_mem block_diff_ac,
 	cl_mem edge_detector_map,
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 1797fe66..89d86bb4 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -267,8 +267,8 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const
 	size_t xsize, size_t ysize,
 	size_t res_xsize, size_t res_ysize,
 	size_t step,
-	float *init_result,
-	float *result)
+	const float *init_result,
+	const float *result)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -311,8 +311,8 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const
 // ian todo
 void tclCalculateDiffmap(const size_t xsize, const size_t ysize,
 	const size_t step,
-	float *diffmap, size_t org_len,
-	float *diffmap_cmp)
+	const float *diffmap, size_t org_len,
+	const float *diffmap_cmp)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -329,7 +329,7 @@ void tclCalculateDiffmap(const size_t xsize, const size_t ysize,
 }
 
 // chrisk todo
-void tclBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result)
+void tclBlur(const float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, const float* result)
 {
     size_t channel_size = xsize * ysize * sizeof(float);
     cl_int err = 0;
@@ -423,10 +423,10 @@ void tclUpsample(float* image, size_t xsize, size_t ysize,
 
 // ian todo
 void tclDiffPrecompute(
   const std::vector<std::vector<float> > &xyb0,
   const std::vector<std::vector<float> > &xyb1,
-	size_t xsize, size_t ysize,
-  std::vector<std::vector<float> > *mask_cmp)
+  size_t xsize, size_t ysize,
+  const std::vector<std::vector<float> > *mask_cmp)
 {
   cl_int err = 0;
   ocl_args_d_t &ocl = getOcl();
@@ -463,7 +463,7 @@ void tclDiffPrecompute(
 }
 
 // ian todo
-void tclAverage5x5(int xsize, int ysize, std::vector<float> &diffs_org, std::vector<float> &diffs_cmp)
+void tclAverage5x5(int xsize, int ysize, const std::vector<float> &diffs_org, const std::vector<float> &diffs_cmp)
 {
   cl_int err = 0;
   ocl_args_d_t &ocl = getOcl();
@@ -479,9 +479,9 @@ void tclAverage5x5(int xsize, int ysize, std::vector<float> &diffs_org, std::vec
 }
 
 // chrisk todo
-void tclMinSquareVal(float *img, size_t square_size, size_t offset,
+void tclMinSquareVal(const float *img, size_t square_size, size_t offset,
 	size_t xsize, size_t ysize,
-	float *values)
+	const float *result)
 {
 	size_t img_size = xsize * ysize * sizeof(float);
 	cl_int err = 0;
@@ -496,7 +496,7 @@ void tclMinSquareVal(float *img, size_t square_size, size_t offset,
 	cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, img_size, 0, NULL, NULL, &err);
 	err = clFinish(ocl.commandQueue);
 
-	FLOAT_COMPARE(values, r_r, xsize * ysize);
+	FLOAT_COMPARE(result, r_r, xsize * ysize);
 
 	clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
@@ -504,7 +504,7 @@ void tclMinSquareVal(float *img, size_t square_size, size_t offset,
 	clReleaseMemObject(r);
 }
 
-void tclScaleImage(double scale, float *result_org, float *result_cmp, size_t length)
+void tclScaleImage(double scale, const float *result_org, const float *result_cmp, size_t length)
 {
   cl_int err = 0;
   ocl_args_d_t &ocl = getOcl();
@@ -522,8 +522,8 @@ void tclScaleImage(double scale, float *result_org, float *result_cmp, size_t le
 }
 
 // strong todo
-void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ysize,
-	float* result_r, float* result_g, float* result_b)
+void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t xsize, size_t ysize,
+	const float* result_r, const float* result_g, const float* result_b)
 {
 	size_t channel_size = xsize * ysize * sizeof(float);
 	cl_int err = 0;
diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h
index 226d3d0a..a84b94ac 100644
--- a/clguetzli/clguetzli_test.h
+++ b/clguetzli/clguetzli_test.h
@@ -9,7 +9,7 @@ void tclMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	const float* result_r, const float* result_g, const float* result_b,
 	const float* result_r2, const float* result_g2, const float* result_b2);
 
-void tclBlur(float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, float* result);
+void tclBlur(const float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, const float* result);
 
 void tclEdgeDetectorMap(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
@@ -40,13 +40,13 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const
 	size_t xsize, size_t ysize,
 	size_t res_xsize, size_t res_ysize,
 	size_t step,
-	float *init_result,
-	float *result);
+	const float *init_result,
+	const float *result);
 
 void tclCalculateDiffmap(const size_t xsize, const size_t ysize,
 	const size_t step,
-	float *diffmap, size_t org_len,
-	float *diffmap_cmp);
+	const float *diffmap, size_t org_len,
+	const float *diffmap_cmp);
 
 void tclConvolution(size_t xsize, size_t ysize,
 	size_t xstep,
@@ -56,26 +56,23 @@ void tclConvolution(size_t xsize, size_t ysize,
 	float border_ratio,
 	float* result);
 
-void tclBlur(size_t xsize, size_t ysize, float* channel, double sigma,
-	double border_ratio);
-
 void tclDiffPrecompute(
   const std::vector<std::vector<float> > &xyb0,
   const std::vector<std::vector<float> > &xyb1,
-	size_t xsize, size_t ysize,
-  std::vector<std::vector<float> > *mask_cmp);
+  size_t xsize, size_t ysize,
+  const std::vector<std::vector<float> > *mask_cmp);
 
-void tclAverage5x5(int xsize, int ysize, std::vector<float> &diffs_org, std::vector<float> &diffs_cmp);
+void tclAverage5x5(int xsize, int ysize, const std::vector<float> &diffs_org, const std::vector<float> &diffs_cmp);
 
-void tclScaleImage(double scale, float *result_org, float *result_cmp, size_t length);
+void tclScaleImage(double scale, const float *result_org, const float *result_cmp, size_t length);
 
-void tclOpsinDynamicsImage(float* r, float* g, float* b, size_t xsize, size_t ysize,
-	float* result_r, float* result_g, float* result_b);
+void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t xsize, size_t ysize,
+	const float* result_r, const float* result_g, const float* result_b);
 
-void tclMinSquareVal(float *img, size_t square_size, size_t offset,
+void tclMinSquareVal(const float *img, size_t square_size, size_t offset,
 	size_t xsize, size_t ysize,
-	float *values);
+	const float *result);
 
-void tclUpsample(float* image, size_t xsize, size_t ysize,
+void tclUpsample(const float* image, size_t xsize, size_t ysize,
 	size_t xstep, size_t ystep,
-	float* result);
+	const float* result);
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index 517b42c3..594adeec 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -197,7 +197,6 @@ void* ocl_args_d_t::allocC(size_t s)
 
 cl_mem ocl_args_d_t::allocMem(size_t s, const void *init)
 {
-	cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64;
 	cl_int err = 0;
 	cl_mem mem = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err);
 	if (CL_SUCCESS != err)
@@ -221,19 +220,14 @@ cl_mem ocl_args_d_t::allocMem(size_t s, const void *init)
 	return mem;
 }
 
-ocl_channels ocl_args_d_t::allocMemChannels(size_t s)
+ocl_channels ocl_args_d_t::allocMemChannels(size_t s, const void *c0, const void *c1, const void *c2)
 {
-	cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64;
-	cl_int err = 0;
+	const void *c[3] = { c0, c1, c2 };
 
 	ocl_channels img;
     for (int i = 0; i < 3; i++)
     {
-        img.ch[i] = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err);
-        if (CL_SUCCESS != err)
-        {
-            LogError("Error: allocMemChannel(%d) for buffer returned %s.\n", i, TranslateOpenCLError(err));
-        }
+        img.ch[i] = allocMem(s, c[i]);
     }
 
 	return img;
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index d72000b3..ed2f1ee2 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -60,7 +60,7 @@ enum KernelName {
 	KERNEL_UPSAMPLESQUAREROOT,
 	KERNEL_ADDBORDER,
 	KERNEL_REMOVEBORDER,
-  KERNEL_AVERAGE5X5,
+    KERNEL_AVERAGE5X5,
 	KERNEL_EDGEDETECTOR,
 	KERNEL_BLOCKDIFFMAP,
 	KERNEL_EDGEDETECTORLOWFREQ,
@@ -78,7 +78,7 @@ struct ocl_args_d_t
 	void* allocC(size_t s);
 
 	cl_mem allocMem(size_t s, const void *init = NULL);
-	ocl_channels allocMemChannels(size_t s);
+	ocl_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL);
     void releaseMemChannels(ocl_channels rgb);
 
 	// Regular OpenCL objects:

From a469c023ab864003a52e821a1f91c74422a78589 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 24 May 2017 15:17:14 +0800
Subject: [PATCH 108/189] =?UTF-8?q?=E7=B2=BE=E7=AE=80=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cpp      |  27 ++-----
 clguetzli/clguetzli_test.cpp | 145 ++++++++---------------------------
 2 files changed, 38 insertions(+), 134 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index f6618d3c..ee85cba6 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -801,14 +801,8 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
     }
 
 	size_t channel_size = 512 * 3 * sizeof(double);
-	ocl_channels xyb = ocl.allocMemChannels(channel_size);
-	ocl_channels xyb_dc = ocl.allocMemChannels(channel_size);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb.x, CL_FALSE, 0, channel_size, lut_x, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb.y, CL_FALSE, 0, channel_size, lut_y, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb.b, CL_FALSE, 0, channel_size, lut_b, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb_dc.x, CL_FALSE, 0, channel_size, lut_dcx, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb_dc.y, CL_FALSE, 0, channel_size, lut_dcy, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb_dc.b, CL_FALSE, 0, channel_size, lut_dcb, 0, NULL, NULL);
+	ocl_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b);
+    ocl_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb);
 
 	cl_kernel kernel = ocl.kernel[KERNEL_DOMASK];
 	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.r);
@@ -880,19 +874,11 @@ void clMask(const float* r, const float* g, const float* b,
 
     cl_int channel_size = xsize * ysize * sizeof(float);
 
-    ocl_channels rgb = ocl.allocMemChannels(channel_size);
-    ocl_channels rgb2 = ocl.allocMemChannels(channel_size);
+    ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
+    ocl_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2);
     ocl_channels mask = ocl.allocMemChannels(channel_size);
     ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
 
-    clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
-    clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
-    clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
-    clEnqueueWriteBuffer(ocl.commandQueue, rgb2.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
-    clEnqueueWriteBuffer(ocl.commandQueue, rgb2.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
-    clEnqueueWriteBuffer(ocl.commandQueue, rgb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
-    err = clFinish(ocl.commandQueue);
-
     clMaskEx(rgb, rgb2, xsize, ysize, mask, mask_dc);
 
     cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
@@ -1105,10 +1091,7 @@ void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
 	ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
 	ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
 
-    cl_mem mem_result = ocl.allocMem(channel_size);
-	const float pattern = 0;
-	clEnqueueFillBuffer(ocl.commandQueue, mem_result, &pattern, sizeof(float), 0, res_xsize * res_ysize, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, mem_result, CL_FALSE, 0, channel_step_size, result, 0, NULL, NULL);
+    cl_mem mem_result = ocl.allocMem(channel_size, result);
 
 	cl_mem edge_detector_map = ocl.allocMem(3 * channel_step_size);
 	cl_mem block_diff_dc	 = ocl.allocMem(3 * channel_step_size);
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 89d86bb4..58a23d35 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -34,16 +34,8 @@ void tclMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	size_t channel_size = xsize * ysize * sizeof(float);
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	ocl_channels xyb0 = ocl.allocMemChannels(channel_size);
-	ocl_channels xyb1 = ocl.allocMemChannels(channel_size);
-
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
+	ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
+	ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
 
 	clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
 
@@ -87,18 +79,10 @@ void tclEdgeDetectorMap(const float* r, const float* g, const float* b,
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	ocl_channels xyb0 = ocl.allocMemChannels(channel_size);
-	ocl_channels xyb1 = ocl.allocMemChannels(channel_size);
+	ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
+	ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
 	cl_mem edge = ocl.allocMem(edgemap_size);
 
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
-
 	clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge);
 
 	cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, edge, true, CL_MAP_READ, 0, edgemap_size, 0, NULL, NULL, &err);
@@ -127,20 +111,12 @@ void tclBlockDiffMap(const float* r, const float* g, const float* b,
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	ocl_channels xyb0 = ocl.allocMemChannels(channel_size);
-	ocl_channels xyb1 = ocl.allocMemChannels(channel_size);
+	ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
+	ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
 
 	cl_mem block_diff_dc = ocl.allocMem(reschannel_size);
 	cl_mem block_diff_ac = ocl.allocMem(reschannel_size);
 
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
-
 	clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac);
 
 	cl_float *r_dc = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_dc, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err);
@@ -175,20 +151,10 @@ void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	ocl_channels xyb0 = ocl.allocMemChannels(channel_size);
-	ocl_channels xyb1 = ocl.allocMemChannels(channel_size);
-
-	cl_mem block_diff_ac = ocl.allocMem(reschannel_size);
+	ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
+	ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
 
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb0.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, xyb1.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
-    clEnqueueWriteBuffer(ocl.commandQueue, block_diff_ac, CL_FALSE, 0, reschannel_size, orign_ac, 0, NULL, NULL);
-
-	err = clFinish(ocl.commandQueue);
+	cl_mem block_diff_ac = ocl.allocMem(reschannel_size, orign_ac);
 
 	clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac);
 
@@ -215,20 +181,12 @@ void tclMask(const float* r, const float* g, const float* b,
 	size_t channel_size = xsize * ysize * sizeof(float);
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	ocl_channels rgb = ocl.allocMemChannels(channel_size);
-	ocl_channels rgb2 = ocl.allocMemChannels(channel_size);
+	ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
+	ocl_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2);
 
 	ocl_channels mask = ocl.allocMemChannels(channel_size);
 	ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
 
-	clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, rgb2.r, CL_FALSE, 0, channel_size, r2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, rgb2.g, CL_FALSE, 0, channel_size, g2, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, rgb2.b, CL_FALSE, 0, channel_size, b2, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
-
 	clMaskEx(rgb, rgb2, xsize, ysize, mask/*out*/, mask_dc/*out*/);
 
 	cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
@@ -274,24 +232,13 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const
 	ocl_args_d_t &ocl = getOcl();
 
 	size_t channel_size = xsize * ysize * sizeof(float);
-	ocl_channels mask = ocl.allocMemChannels(channel_size);
-	ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
-	cl_mem cl_block_diff_dc = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
-	cl_mem cl_block_diff_ac = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
-	cl_mem cl_edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float));
-	cl_mem cl_result = ocl.allocMem(res_xsize * res_ysize * sizeof(float));
-
-	clEnqueueWriteBuffer(ocl.commandQueue, mask.x, CL_FALSE, 0, channel_size, mask_xyb_x, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, mask.y, CL_FALSE, 0, channel_size, mask_xyb_y, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, mask.b, CL_FALSE, 0, channel_size, mask_xyb_b, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, mask_dc.x, CL_FALSE, 0, channel_size, mask_xyb_dc_x, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, mask_dc.y, CL_FALSE, 0, channel_size, mask_xyb_dc_y, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, mask_dc.b, CL_FALSE, 0, channel_size, mask_xyb_dc_b, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, cl_block_diff_dc, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), block_diff_dc, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, cl_block_diff_ac, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), block_diff_ac, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, cl_edge_detector_map, CL_FALSE, 0, 3 * res_xsize * res_ysize * sizeof(float), edge_detector_map, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, cl_result, CL_FALSE, 0, res_xsize * res_ysize * sizeof(float), init_result, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
+    size_t res_channel_size = res_xsize * res_ysize * sizeof(float);
+	ocl_channels mask = ocl.allocMemChannels(channel_size, mask_xyb_x, mask_xyb_y, mask_xyb_b);
+	ocl_channels mask_dc = ocl.allocMemChannels(channel_size, mask_xyb_dc_x, mask_xyb_dc_y, mask_xyb_dc_b);
+	cl_mem cl_block_diff_dc = ocl.allocMem(3 * res_channel_size, block_diff_dc);
+	cl_mem cl_block_diff_ac = ocl.allocMem(3 * res_channel_size, block_diff_ac);
+	cl_mem cl_edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float), edge_detector_map);
+	cl_mem cl_result = ocl.allocMem(res_xsize * res_ysize * sizeof(float), init_result);
 
 	clCombineChannelsEx(mask, mask_dc, cl_block_diff_dc, cl_block_diff_ac, cl_edge_detector_map, xsize, ysize, res_xsize, step, cl_result);
 
@@ -299,7 +246,7 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const
 
 	FLOAT_COMPARE(result_tmp, result, res_xsize * res_ysize);
 
-  clEnqueueUnmapMemObject(ocl.commandQueue, cl_result, result_tmp, 0, NULL, NULL);
+    clEnqueueUnmapMemObject(ocl.commandQueue, cl_result, result_tmp, 0, NULL, NULL);
 	ocl.releaseMemChannels(mask);
 	ocl.releaseMemChannels(mask_dc);
 	clReleaseMemObject(cl_block_diff_dc);
@@ -322,9 +269,9 @@ void tclCalculateDiffmap(const size_t xsize, const size_t ysize,
 	clEnqueueWriteBuffer(ocl.commandQueue, mem_diffmap, CL_FALSE, 0, org_len * sizeof(float), diffmap, 0, NULL, NULL);
 	clCalculateDiffmapEx(mem_diffmap, xsize, ysize, step);
 	cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diffmap, true, CL_MAP_READ, 0, length, 0, NULL, NULL, &err);
-  err = clFinish(ocl.commandQueue);
+    err = clFinish(ocl.commandQueue);
 	FLOAT_COMPARE(result_tmp, diffmap_cmp, xsize * ysize);
-  clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, 0, NULL, NULL);
+    clEnqueueUnmapMemObject(ocl.commandQueue, mem_diffmap, result_tmp, 0, NULL, NULL);
 	clReleaseMemObject(mem_diffmap);
 }
 
@@ -334,10 +281,7 @@ void tclBlur(const float* channel, size_t xsize, size_t ysize, double sigma, dou
     size_t channel_size = xsize * ysize * sizeof(float);
     cl_int err = 0;
     ocl_args_d_t &ocl = getOcl();
-    cl_mem r = ocl.allocMem(channel_size);
-
-    clEnqueueWriteBuffer(ocl.commandQueue, r, CL_FALSE, 0, channel_size, channel, 0, NULL, NULL);
-    err = clFinish(ocl.commandQueue);
+    cl_mem r = ocl.allocMem(channel_size, channel);
 
     clBlurEx(r, xsize, ysize, sigma, border_ratio, r);
 
@@ -369,12 +313,8 @@ void tclConvolution(size_t xsize, size_t ysize,
 	ocl_args_d_t &ocl = getOcl();
 	ocl.allocA(result_size);
 	cl_mem r = ocl.srcA;
-	cl_mem i = ocl.allocMem(inp_size);
-	cl_mem m = ocl.allocMem(multipliers_size);
-
-	clEnqueueWriteBuffer(ocl.commandQueue, i, CL_FALSE, 0, inp_size, inp, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, m, CL_FALSE, 0, multipliers_size, multipliers, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
+	cl_mem i = ocl.allocMem(inp_size, inp);
+	cl_mem m = ocl.allocMem(multipliers_size, multipliers);
 
 	clConvolutionEx(i, xsize, ysize, m, len, xstep, offset, border_ratio, r);
 
@@ -401,13 +341,10 @@ void tclUpsample(float* image, size_t xsize, size_t ysize,
 	size_t result_size = xsize * ysize * sizeof(float);
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	cl_mem img = ocl.allocMem(img_size);
+	cl_mem img = ocl.allocMem(img_size, image);
 	ocl.allocA(result_size);
 	cl_mem r = ocl.srcA;
 
-	clEnqueueWriteBuffer(ocl.commandQueue, img, CL_FALSE, 0, img_size, image, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
-
 	clUpsampleEx(img, xsize, ysize, xstep, ystep, r);
 
 	cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err);
@@ -431,18 +368,10 @@ void tclDiffPrecompute(
   cl_int err = 0;
   ocl_args_d_t &ocl = getOcl();
   size_t channel_size = xsize * ysize * sizeof(float);
-  ocl_channels cl_xyb0 = ocl.allocMemChannels(channel_size);
-  ocl_channels cl_xyb1 = ocl.allocMemChannels(channel_size);
+  ocl_channels cl_xyb0 = ocl.allocMemChannels(channel_size, xyb0[0].data(), xyb0[1].data(), xyb0[2].data());
+  ocl_channels cl_xyb1 = ocl.allocMemChannels(channel_size, xyb1[0].data(), xyb1[1].data(), xyb1[2].data());
   ocl_channels cl_mask = ocl.allocMemChannels(channel_size);
 
-  clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb0.x, CL_FALSE, 0, channel_size, xyb0[0].data(), 0, NULL, NULL);
-  clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb0.y, CL_FALSE, 0, channel_size, xyb0[1].data(), 0, NULL, NULL);
-  clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb0.b, CL_FALSE, 0, channel_size, xyb0[2].data(), 0, NULL, NULL);
-  clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb1.x, CL_FALSE, 0, channel_size, xyb1[0].data(), 0, NULL, NULL);
-  clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb1.y, CL_FALSE, 0, channel_size, xyb1[1].data(), 0, NULL, NULL);
-  clEnqueueWriteBuffer(ocl.commandQueue, cl_xyb1.b, CL_FALSE, 0, channel_size, xyb1[2].data(), 0, NULL, NULL);
-
-
   clDiffPrecomputeEx(cl_xyb0, cl_xyb1, xsize, ysize, cl_mask);
 
   cl_float *r_x = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.x, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
@@ -467,8 +396,8 @@ void tclAverage5x5(int xsize, int ysize, const std::vector<float> &diffs_org, co
 {
   cl_int err = 0;
   ocl_args_d_t &ocl = getOcl();
-  cl_mem mem_diff = ocl.allocMem(xsize * ysize * sizeof(float));
-  clEnqueueWriteBuffer(ocl.commandQueue, mem_diff, CL_FALSE, 0, xsize * ysize * sizeof(float), diffs_org.data(), 0, NULL, NULL);
+  cl_mem mem_diff = ocl.allocMem(xsize * ysize * sizeof(float), diffs_org.data());
+
   clAverage5x5Ex(mem_diff, xsize, ysize);
   cl_float *r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_diff, true, CL_MAP_READ, 0, xsize * ysize * sizeof(float), 0, NULL, NULL, &err);
   err = clFinish(ocl.commandQueue);
@@ -486,10 +415,7 @@ void tclMinSquareVal(const float *img, size_t square_size, size_t offset,
 	size_t img_size = xsize * ysize * sizeof(float);
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	cl_mem r = ocl.allocMem(img_size);
-
-	clEnqueueWriteBuffer(ocl.commandQueue, r, CL_FALSE, 0, img_size, img, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
+	cl_mem r = ocl.allocMem(img_size, img);
 
 	clMinSquareValEx(r, xsize, ysize, square_size, offset);
 
@@ -508,8 +434,8 @@ void tclScaleImage(double scale, const float *result_org, const float *result_cm
 {
   cl_int err = 0;
   ocl_args_d_t &ocl = getOcl();
-  cl_mem mem_result_org = ocl.allocMem(length * sizeof(float));
-  clEnqueueWriteBuffer(ocl.commandQueue, mem_result_org, CL_FALSE, 0, length * sizeof(float), result_org, 0, NULL, NULL);
+  cl_mem mem_result_org = ocl.allocMem(length * sizeof(float), result_org);
+
   clScaleImageEx(mem_result_org, length, scale);
 
   cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result_org, true, CL_MAP_READ, 0, length * sizeof(float), 0, NULL, NULL, &err);
@@ -528,12 +454,7 @@ void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_
 	size_t channel_size = xsize * ysize * sizeof(float);
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
-	ocl_channels rgb = ocl.allocMemChannels(channel_size);
-
-	clEnqueueWriteBuffer(ocl.commandQueue, rgb.r, CL_FALSE, 0, channel_size, r, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, rgb.g, CL_FALSE, 0, channel_size, g, 0, NULL, NULL);
-	clEnqueueWriteBuffer(ocl.commandQueue, rgb.b, CL_FALSE, 0, channel_size, b, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
+	ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
 
 	clOpsinDynamicsImageEx(rgb, xsize, ysize);
 

From e68cea4493b60ce77d7933d3275cc6a595a3559f Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 24 May 2017 15:40:13 +0800
Subject: [PATCH 109/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=8F=82=E6=95=B0?=
 =?UTF-8?q?=E9=A1=BA=E5=BA=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clbutter_comparator.cpp |  15 +-
 clguetzli/clguetzli.cpp           | 450 +++++++++++++++---------------
 clguetzli/clguetzli.h             |  25 +-
 guetzli/processor.cc              |  20 +-
 4 files changed, 260 insertions(+), 250 deletions(-)

diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index c61c8578..c6b4ca0b 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -17,8 +17,8 @@ namespace butteraugli
         if (g_useOpenCL && xsize_ > 100 && ysize_ > 100)
         {
             result.resize(xsize_ * ysize_);
-            clDiffmapOpsinDynamicsImage(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-                xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_, result.data());
+            clDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+                xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_);
         }
         else
         {
@@ -163,11 +163,12 @@ namespace butteraugli
                 (*mask)[i].resize(xsize * ysize);
                 (*mask_dc)[i].resize(xsize * ysize);
             }
-            clMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-                xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+            clMask((*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(),
+                (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data(),
                 xsize, ysize,
-                (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(),
-                (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data());
+                xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+                xyb1[0].data(), xyb1[1].data(), xyb1[2].data()
+                );
             return;
         }
 
@@ -279,7 +280,7 @@ namespace butteraugli
             float * g = rgb[1].data();
             float * b = rgb[2].data();
 
-            clOpsinDynamicsImage(xsize, ysize, r, g, b);
+            clOpsinDynamicsImage(r, g, b, xsize, ysize);
         }
         else
         {
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index ee85cba6..fa3507a4 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -68,6 +68,233 @@ ocl_args_d_t& getOcl(void)
 	return ocl;
 }
 
+void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize)
+{
+    // Runs clOpsinDynamicsImageEx() on the r/g/b planes (xsize * ysize floats each) and copies
+    // the processed planes back into r/g/b. size_t keeps the byte count from wrapping a cl_int.
+    const size_t channel_size = xsize * ysize * sizeof(float);
+    ocl_args_d_t &ocl = getOcl();
+    ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
+    clOpsinDynamicsImageEx(rgb, xsize, ysize);
+    clFinish(ocl.commandQueue);
+
+    cl_mem planes[3] = { rgb.r, rgb.g, rgb.b };
+    float *outputs[3] = { r, g, b };
+    for (int c = 0; c < 3; c++)
+    {
+        cl_int err = CL_SUCCESS;
+        void *mapped = clEnqueueMapBuffer(ocl.commandQueue, planes[c], true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+        if (CL_SUCCESS != err || NULL == mapped)
+        {
+            LogError("Error: clOpsinDynamicsImage() clEnqueueMapBuffer returned %s.\n", TranslateOpenCLError(err));
+            continue;
+        }
+        memcpy(outputs[c], mapped, channel_size);
+        clEnqueueUnmapMemObject(ocl.commandQueue, planes[c], mapped, 0, NULL, NULL);
+    }
+    clFinish(ocl.commandQueue);
+    ocl.releaseMemChannels(rgb);
+}
+
+void clDiffmapOpsinDynamicsImage(
+    float* result,
+    const float* r,  const float* g,  const float* b,
+    const float* r2, const float* g2, const float* b2,
+    size_t xsize, size_t ysize,
+    size_t step)
+{
+    // Chains the OpenCL *Ex stages below on two three-plane images and copies the final
+    // xsize * ysize float map into result.
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+    // Byte counts stay size_t; storing them in a cl_int truncates them for large images.
+    const size_t channel_size = xsize * ysize * sizeof(float);
+    const size_t channel_step_size = res_xsize * res_ysize * sizeof(float);
+
+    cl_int err = 0;
+    ocl_args_d_t &ocl = getOcl();
+    ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
+    ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
+    cl_mem mem_result = ocl.allocMem(channel_size, result);
+    cl_mem edge_detector_map = ocl.allocMem(3 * channel_step_size);
+    cl_mem block_diff_dc = ocl.allocMem(3 * channel_step_size);
+    cl_mem block_diff_ac = ocl.allocMem(3 * channel_step_size);
+
+    clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
+    clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge_detector_map);
+    clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac);
+    clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac);
+    {
+        ocl_channels mask = ocl.allocMemChannels(channel_size);
+        ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
+        clMaskEx(xyb0, xyb1, xsize, ysize, mask, mask_dc);
+        clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, res_xsize, step, mem_result);
+        ocl.releaseMemChannels(mask);
+        ocl.releaseMemChannels(mask_dc);
+    }
+    clCalculateDiffmapEx(mem_result, xsize, ysize, step);
+
+    cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+    clFinish(ocl.commandQueue);
+    if (CL_SUCCESS != err || NULL == result_r)
+    {
+        LogError("Error: clDiffmapOpsinDynamicsImage() clEnqueueMapBuffer returned %s.\n", TranslateOpenCLError(err));
+    }
+    else
+    {
+        memcpy(result, result_r, channel_size);
+        clEnqueueUnmapMemObject(ocl.commandQueue, mem_result, result_r, 0, NULL, NULL);
+        clFinish(ocl.commandQueue);
+    }
+
+    ocl.releaseMemChannels(xyb1);
+    ocl.releaseMemChannels(xyb0);
+    clReleaseMemObject(edge_detector_map);
+    clReleaseMemObject(block_diff_dc);
+    clReleaseMemObject(block_diff_ac);
+    clReleaseMemObject(mem_result);
+}
+
+void clComputeBlockZeroingOrder(
+    guetzli::CoeffData *output_order_batch,
+    const channel_info orig_channel[3],
+    const float *orig_image_batch,
+    const float *mask_scale,
+    const int image_width,
+    const int image_height,
+    const channel_info mayout_channel[3],
+    const int factor,
+    const int comp_mask,
+    const float BlockErrorLimit)
+{
+    // Runs KERNEL_COMPUTEBLOCKZEROINGORDER on the uploaded inputs and copies its output into output_order_batch.
+    const int block8_width = (image_width + 8 - 1) / 8;
+    const int block8_height = (image_height + 8 - 1) / 8;
+    const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor);
+    const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor);
+    using namespace guetzli;
+    cl_int err = 0;
+    ocl_args_d_t &ocl = getOcl();
+
+    cl_mem mem_orig_coeff[3];
+    cl_mem mem_mayout_coeff[3];
+    cl_mem mem_mayout_pixel[3];
+    for (int c = 0; c < 3; c++)
+    {
+        int block_count = orig_channel[c].block_width * orig_channel[c].block_height;
+        mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff);
+        block_count = mayout_channel[c].block_width * mayout_channel[c].block_height;
+        mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff);
+        mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel);
+    }
+    cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch);
+    cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale);
+    // size_t, not int: the product below is already a size_t and is only ever used as a byte count.
+    const size_t output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height;
+    cl_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size);
+    cl_float clBlockErrorLimit = BlockErrorLimit;
+    cl_int clWidth = image_width;
+    cl_int clHeight = image_height;
+    cl_int clFactor = factor;
+    cl_int clMask = comp_mask;
+
+    cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER];
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_coeff[0]);
+    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_coeff[1]);
+    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_orig_coeff[2]);
+    clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_orig_image);
+    clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mem_mask_scale);
+    clSetKernelArg(kernel, 5, sizeof(cl_int), &clWidth);
+    clSetKernelArg(kernel, 6, sizeof(cl_int), &clHeight);
+    clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&mem_mayout_coeff[0]);
+    clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&mem_mayout_coeff[1]);
+    clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&mem_mayout_coeff[2]);
+    clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&mem_mayout_pixel[0]);
+    clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&mem_mayout_pixel[1]);
+    clSetKernelArg(kernel, 12, sizeof(cl_mem), (void*)&mem_mayout_pixel[2]);
+    clSetKernelArg(kernel, 13, sizeof(channel_info), &mayout_channel[0]);
+    clSetKernelArg(kernel, 14, sizeof(channel_info), &mayout_channel[1]);
+    clSetKernelArg(kernel, 15, sizeof(channel_info), &mayout_channel[2]);
+    clSetKernelArg(kernel, 16, sizeof(cl_int), &clFactor);
+    clSetKernelArg(kernel, 17, sizeof(cl_int), &clMask);
+    clSetKernelArg(kernel, 18, sizeof(cl_float), &clBlockErrorLimit);
+    clSetKernelArg(kernel, 19, sizeof(cl_mem), &mem_output_order_batch);
+
+    // Explicit casts: brace-initialising a size_t from a non-constant int is a narrowing conversion.
+    size_t globalWorkSize[2] = { static_cast<size_t>(blockf_width), static_cast<size_t>(blockf_height) };
+    err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    if (CL_SUCCESS != err)
+    {
+        LogError("Error: clComputeBlockZeroingOrder() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
+    }
+    err = clFinish(ocl.commandQueue);
+    if (CL_SUCCESS != err)
+    {
+        LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err));
+    }
+
+    CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_batch, true, CL_MAP_READ, 0, output_order_batch_size, 0, NULL, NULL, &err);
+    if (CL_SUCCESS != err || NULL == result)
+    {
+        LogError("Error: clComputeBlockZeroingOrder() clEnqueueMapBuffer returned %s.\n", TranslateOpenCLError(err));
+    }
+    else
+    {
+        memcpy(output_order_batch, result, output_order_batch_size);
+        clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, 0, NULL, NULL);
+        clFinish(ocl.commandQueue);
+    }
+    for (int c = 0; c < 3; c++)
+    {
+        clReleaseMemObject(mem_orig_coeff[c]);
+        clReleaseMemObject(mem_mayout_coeff[c]);
+        clReleaseMemObject(mem_mayout_pixel[c]);
+    }
+    clReleaseMemObject(mem_orig_image);
+    clReleaseMemObject(mem_mask_scale);
+    clReleaseMemObject(mem_output_order_batch);
+}
+
+void clMask(
+    float* mask_r,  float* mask_g,    float* mask_b,
+    float* maskdc_r, float* maskdc_g, float* maskdc_b,
+    size_t xsize, size_t ysize,
+    const float* r,  const float* g,  const float* b,
+    const float* r2, const float* g2, const float* b2)
+{
+    // Runs clMaskEx() on two three-plane images (xsize * ysize floats per plane) and copies the
+    // six mask / mask_dc planes to the caller. Each mapping is unmapped before its buffer is freed.
+    ocl_args_d_t &ocl = getOcl();
+    const size_t channel_size = xsize * ysize * sizeof(float);
+    ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
+    ocl_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2);
+    ocl_channels mask = ocl.allocMemChannels(channel_size);
+    ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
+
+    clMaskEx(rgb, rgb2, xsize, ysize, mask, mask_dc);
+    clFinish(ocl.commandQueue);
+
+    cl_mem planes[6] = { mask.r, mask.g, mask.b, mask_dc.r, mask_dc.g, mask_dc.b };
+    float *outputs[6] = { mask_r, mask_g, mask_b, maskdc_r, maskdc_g, maskdc_b };
+    for (int i = 0; i < 6; i++)
+    {
+        cl_int err = CL_SUCCESS;
+        void *mapped = clEnqueueMapBuffer(ocl.commandQueue, planes[i], true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+        if (CL_SUCCESS != err || NULL == mapped)
+        {
+            LogError("Error: clMask() clEnqueueMapBuffer returned %s.\n", TranslateOpenCLError(err));
+            continue;
+        }
+        memcpy(outputs[i], mapped, channel_size);
+        clEnqueueUnmapMemObject(ocl.commandQueue, planes[i], mapped, 0, NULL, NULL);
+    }
+    clFinish(ocl.commandQueue);
+    ocl.releaseMemChannels(rgb);
+    ocl.releaseMemChannels(rgb2);
+    ocl.releaseMemChannels(mask);
+    ocl.releaseMemChannels(mask_dc);
+}
+
 void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize,
 				     cl_mem multipliers, size_t len,
                      int xstep, int offset, double border_ratio,
@@ -353,33 +580,6 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb/*in,out*/, const size_t xsize, con
 	ocl.releaseMemChannels(rgb_blurred);
 }
 
-void clOpsinDynamicsImage(const size_t xsize, const size_t ysize, float* r, float* g, float* b)
-{
-	cl_int channel_size = xsize * ysize * sizeof(float);
-
-	cl_int err = 0;
-	ocl_args_d_t &ocl = getOcl();
-    ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
-
-	clOpsinDynamicsImageEx(rgb, xsize, ysize);
-
-	cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-	cl_float *result_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-	cl_float *result_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-
-	err = clFinish(ocl.commandQueue);
-
-	memcpy(r, result_r, channel_size);
-	memcpy(g, result_g, channel_size);
-	memcpy(b, result_b, channel_size);
-
-	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, result_r, 0, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, result_g, 0, NULL, NULL);
-	clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, result_b, 0, NULL, NULL);
-	clFinish(ocl.commandQueue);
-
-    ocl.releaseMemChannels(rgb);
-}
 
 void clMaskHighIntensityChangeEx(ocl_channels &xyb0/*in,out*/,
 								 ocl_channels &xyb1/*in,out*/,
@@ -863,45 +1063,6 @@ void clMaskEx(const ocl_channels &rgb, const ocl_channels &rgb2,
     }
 }
 
-void clMask(const float* r, const float* g, const float* b,
-    const float* r2, const float* g2, const float* b2,
-    size_t xsize, size_t ysize,
-    float* mask_r, float* mask_g, float* mask_b,
-    float* maskdc_r, float* maskdc_g, float* maskdc_b)
-{
-    cl_int err = CL_SUCCESS;
-    ocl_args_d_t &ocl = getOcl();
-
-    cl_int channel_size = xsize * ysize * sizeof(float);
-
-    ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
-    ocl_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2);
-    ocl_channels mask = ocl.allocMemChannels(channel_size);
-    ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
-
-    clMaskEx(rgb, rgb2, xsize, ysize, mask, mask_dc);
-
-    cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-    cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-    cl_float *r0_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-    cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-    cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-    cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-    err = clFinish(ocl.commandQueue);
-
-    memcpy(mask_r, r0_r, channel_size);
-    memcpy(mask_g, r0_g, channel_size);
-    memcpy(mask_b, r0_b, channel_size);
-    memcpy(maskdc_r, r1_r, channel_size);
-    memcpy(maskdc_g, r1_g, channel_size);
-    memcpy(maskdc_b, r1_b, channel_size);
-
-    ocl.releaseMemChannels(rgb);
-    ocl.releaseMemChannels(rgb2);
-    ocl.releaseMemChannels(mask);
-    ocl.releaseMemChannels(mask_dc);
-}
-
 void clCombineChannelsEx(
 	const ocl_channels &mask,
 	const ocl_channels &mask_dc,
@@ -1073,160 +1234,3 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize,
 	clReleaseMemObject(blurred);
 }
 
-void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
-								 const float* r2, const float* g2, const float* b2,
-								 size_t xsize, size_t ysize,
-								 size_t step,
-								 float* result)
-{
-
-	const size_t res_xsize = (xsize + step - 1) / step;
-	const size_t res_ysize = (ysize + step - 1) / step;
-
-	cl_int channel_size      = xsize * ysize * sizeof(float);
-	cl_int channel_step_size = res_xsize * res_ysize * sizeof(float);
-
-	cl_int err = 0;
-	ocl_args_d_t &ocl = getOcl();
-	ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
-	ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
-
-    cl_mem mem_result = ocl.allocMem(channel_size, result);
-
-	cl_mem edge_detector_map = ocl.allocMem(3 * channel_step_size);
-	cl_mem block_diff_dc	 = ocl.allocMem(3 * channel_step_size);
-	cl_mem block_diff_ac	 = ocl.allocMem(3 * channel_step_size);
-
-	clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
-
-	clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge_detector_map);
-	clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac);
-	clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac);
-    {
-        ocl_channels mask = ocl.allocMemChannels(channel_size);
-        ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
-        clMaskEx(xyb0, xyb1, xsize, ysize, mask, mask_dc);
-        clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, res_xsize, step, mem_result);
-
-        ocl.releaseMemChannels(mask);
-        ocl.releaseMemChannels(mask_dc);
-    }
-
-    clCalculateDiffmapEx(mem_result, xsize, ysize, step);
-
-	cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-	err = clFinish(ocl.commandQueue);
-	memcpy(result, result_r, channel_size);
-
-	clEnqueueUnmapMemObject(ocl.commandQueue, mem_result, result_r, 0, NULL, NULL);
-	clFinish(ocl.commandQueue);
-
-	ocl.releaseMemChannels(xyb1);
-	ocl.releaseMemChannels(xyb0);
-
-	clReleaseMemObject(edge_detector_map);
-	clReleaseMemObject(block_diff_dc);
-	clReleaseMemObject(block_diff_ac);
-
-	clReleaseMemObject(mem_result);
-}
-
-void clComputeBlockZeroingOrder(
-    const channel_info orig_channel[3],
-    const float *orig_image_batch,
-    const float *mask_scale,
-    const int image_width,
-    const int image_height,
-    const channel_info mayout_channel[3],
-    const int factor,
-    const int comp_mask,
-    const float BlockErrorLimit,
-    guetzli::CoeffData *output_order_batch)
-{
-    const int block8_width = (image_width + 8 - 1) / 8;
-    const int block8_height = (image_height + 8 - 1) / 8;
-    const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor);
-    const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor);
-
-    using namespace guetzli;
-
-    cl_int err = 0;
-    ocl_args_d_t &ocl = getOcl();
-
-    cl_mem mem_orig_coeff[3];
-    cl_mem mem_mayout_coeff[3];
-    cl_mem mem_mayout_pixel[3];
-    for (int c = 0; c < 3; c++)
-    {
-        int block_count = orig_channel[c].block_width * orig_channel[c].block_height;
-        mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff);
-
-        block_count = mayout_channel[c].block_width * mayout_channel[c].block_height;
-        mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff);
-
-        mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel);
-    }
-    cl_mem mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch);
-    cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale);
-
-    int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height;
-    cl_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size);
-    cl_float clBlockErrorLimit = BlockErrorLimit;
-    cl_int clWidth = image_width;
-    cl_int clHeight = image_height;
-    cl_int clFactor = factor;
-    cl_int clMask = comp_mask;
-
-    cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_coeff[0]);
-    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_coeff[1]);
-    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_orig_coeff[2]);
-    clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_orig_image);
-    clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mem_mask_scale);
-    clSetKernelArg(kernel, 5, sizeof(cl_int), &clWidth);
-    clSetKernelArg(kernel, 6, sizeof(cl_int), &clHeight);
-    clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&mem_mayout_coeff[0]);
-    clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&mem_mayout_coeff[1]);
-    clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&mem_mayout_coeff[2]);
-    clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&mem_mayout_pixel[0]);
-    clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&mem_mayout_pixel[1]);
-    clSetKernelArg(kernel, 12, sizeof(cl_mem), (void*)&mem_mayout_pixel[2]);
-    clSetKernelArg(kernel, 13, sizeof(channel_info), &mayout_channel[0]);
-    clSetKernelArg(kernel, 14, sizeof(channel_info), &mayout_channel[1]);
-    clSetKernelArg(kernel, 15, sizeof(channel_info), &mayout_channel[2]);
-    clSetKernelArg(kernel, 16, sizeof(cl_int), &clFactor);
-    clSetKernelArg(kernel, 17, sizeof(cl_int), &clMask);
-    clSetKernelArg(kernel, 18, sizeof(cl_float), &clBlockErrorLimit);
-    clSetKernelArg(kernel, 19, sizeof(cl_mem), &mem_output_order_batch);
-
-    size_t globalWorkSize[2] = {  blockf_width, blockf_height};
-    err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-    if (CL_SUCCESS != err)
-    {
-        LogError("Error: clComputeBlockZeroingOrder() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-    }
-    err = clFinish(ocl.commandQueue);
-    if (CL_SUCCESS != err)
-    {
-        LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err));
-    }
-
-    CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_batch, true, CL_MAP_READ, 0, output_order_batch_size, 0, NULL, NULL, &err);
-    err = clFinish(ocl.commandQueue);
-    memcpy(output_order_batch, result, output_order_batch_size);
-
-    clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, 0, NULL, NULL);
-    clFinish(ocl.commandQueue);
-
-    for (int c = 0; c < 3; c++)
-    {
-        clReleaseMemObject(mem_orig_coeff[c]);
-        clReleaseMemObject(mem_mayout_coeff[c]);
-        clReleaseMemObject(mem_mayout_pixel[c]);
-
-    }
-
-    clReleaseMemObject(mem_orig_image);
-    clReleaseMemObject(mem_mask_scale);
-    clReleaseMemObject(mem_output_order_batch);
-}
\ No newline at end of file
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 4d4a2fcf..a8be9a42 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -9,15 +9,19 @@
 extern bool g_useOpenCL;
 extern bool g_checkOpenCL;
 
-void clOpsinDynamicsImage(const size_t xsize, const size_t ysize, float* r, float* g, float* b);
+void clOpsinDynamicsImage(
+    float *r, float *g, float *b,
+    const size_t xsize, const size_t ysize);
 
-void clDiffmapOpsinDynamicsImage(const float* r, const float* g, const float* b,
+void clDiffmapOpsinDynamicsImage(
+    float* result,
+    const float* r,  const float* g,  const float* b,
     const float* r2, const float* g2, const float* b2,
     const size_t xsize, const size_t ysize,
-    const size_t step,
-    float* result);
+    const size_t step);
 
 void clComputeBlockZeroingOrder(
+    guetzli::CoeffData *output_order_batch,
     const channel_info orig_channel[3],
     const float *orig_image_batch,
     const float *mask_scale,
@@ -26,14 +30,15 @@ void clComputeBlockZeroingOrder(
     const channel_info mayout_channel[3],
     const int factor,
     const int comp_mask,
-    const float BlockErrorLimit,
-    guetzli::CoeffData *output_order_batch);
+    const float BlockErrorLimit
+    );
 
-void clMask(const float* r, const float* g, const float* b,
-    const float* r2, const float* g2, const float* b2,
+void clMask(
+    float* mask_r,   float* mask_g,   float* mask_b,
+    float* maskdc_r, float* maskdc_g, float* maskdc_b,
     const size_t xsize, const size_t ysize,
-    float* mask_r, float* mask_g, float* mask_b,
-    float* maskdc_r, float* maskdc_g, float* maskdc_b);
+    const float* r,  const float* g,  const float* b,
+    const float* r2, const float* g2, const float* b2);
 
 void clMaskHighIntensityChangeEx(ocl_channels &xyb0/*in,out*/,
 	ocl_channels &xyb1/*in,out*/,
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index e5439460..2c9811a9 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -588,16 +588,16 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
         output_order_gpu.resize(num_blocks * kBlockSize);
         output_order = output_order_gpu.data();
 
-        clComputeBlockZeroingOrder(orig_channel,
-                                        comp->imgOpsinDynamicsBlockList.data(),
-                                        comp->imgMaskXyzScaleBlockList.data(),
-                                        width,
-                                        height,
-                                        mayout_channel,
-                                        factor_x,
-                                        comp_mask,
-                                        comp->BlockErrorLimit(),
-                                        output_order);
+        clComputeBlockZeroingOrder(output_order,
+                                    orig_channel,
+                                    comp->imgOpsinDynamicsBlockList.data(),
+                                    comp->imgMaskXyzScaleBlockList.data(),
+                                    width,
+                                    height,
+                                    mayout_channel,
+                                    factor_x,
+                                    comp_mask,
+                                    comp->BlockErrorLimit());
 
     }
     if (!g_useOpenCL || g_checkOpenCL)

From 7c9c34ad258b119e2b92e0f9aa79b4aae0ae5ab4 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 24 May 2017 19:18:15 +0800
Subject: [PATCH 110/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=8F=82=E6=95=B0?=
 =?UTF-8?q?=E8=A7=84=E5=88=99?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cpp      | 225 +++++++++++++++++++----------------
 clguetzli/clguetzli.h        | 117 ++++++++++++------
 clguetzli/clguetzli_test.cpp |  20 ++--
 3 files changed, 214 insertions(+), 148 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index fa3507a4..67a7f2a0 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -123,14 +123,14 @@ void clDiffmapOpsinDynamicsImage(
 
     clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
 
-    clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge_detector_map);
-    clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac);
-    clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac);
+    clEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step);
+    clBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step);
+    clEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step);
     {
         ocl_channels mask = ocl.allocMemChannels(channel_size);
         ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
-        clMaskEx(xyb0, xyb1, xsize, ysize, mask, mask_dc);
-        clCombineChannelsEx(mask, mask_dc, block_diff_dc, block_diff_ac, edge_detector_map, xsize, ysize, res_xsize, step, mem_result);
+        clMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize);
+        clCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step);
 
         ocl.releaseMemChannels(mask);
         ocl.releaseMemChannels(mask_dc);
@@ -272,7 +272,7 @@ void clMask(
     ocl_channels mask = ocl.allocMemChannels(channel_size);
     ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
 
-    clMaskEx(rgb, rgb2, xsize, ysize, mask, mask_dc);
+    clMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize);
 
     cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
     cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
@@ -295,10 +295,11 @@ void clMask(
     ocl.releaseMemChannels(mask_dc);
 }
 
-void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize,
-				     cl_mem multipliers, size_t len,
-                     int xstep, int offset, double border_ratio,
-                     cl_mem result/*out*/)
+void clConvolutionEx(
+    cl_mem result/*out*/,
+    const cl_mem inp, size_t xsize, size_t ysize,
+    const cl_mem multipliers, size_t len,
+    int xstep, int offset, double border_ratio)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -334,10 +335,11 @@ void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize,
 	}
 }
 
-void clConvolutionX(cl_mem inp, size_t xsize, size_t ysize,
-	cl_mem multipliers, size_t len,
-	int xstep, int offset, double border_ratio,
-	cl_mem result/*out*/)
+void clConvolutionX(
+    cl_mem result/*out*/,
+    const cl_mem inp, size_t xsize, size_t ysize,
+	const cl_mem multipliers, size_t len,
+	int xstep, int offset, double border_ratio)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -369,10 +371,12 @@ void clConvolutionX(cl_mem inp, size_t xsize, size_t ysize,
 	}
 }
 
-void clConvolutionY(cl_mem inp, size_t xsize, size_t ysize,
-	cl_mem multipliers, size_t len,
-	int xstep, int offset, double border_ratio,
-	cl_mem result/*out*/)
+void clConvolutionY(
+    cl_mem result/*out*/,
+    const cl_mem inp, size_t xsize, size_t ysize,
+	const cl_mem multipliers, size_t len,
+	int xstep, int offset, double border_ratio
+	)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -404,9 +408,10 @@ void clConvolutionY(cl_mem inp, size_t xsize, size_t ysize,
 	}
 }
 
-void clUpsampleEx2(cl_mem image, size_t xsize, size_t ysize,
-	size_t xstep, size_t ystep,
-	cl_mem result/*out*/)
+void clUpsampleEx2(
+    cl_mem result/*out*/,
+    const cl_mem image, size_t xsize, size_t ysize,
+	size_t xstep, size_t ystep)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -432,9 +437,11 @@ void clUpsampleEx2(cl_mem image, size_t xsize, size_t ysize,
 	}
 }
 
-void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize,
-                  size_t xstep, size_t ystep,
-                  cl_mem result/*out*/)
+void clUpsampleEx(
+    cl_mem result/*out*/,
+    const cl_mem image,
+    const size_t xsize, const size_t ysize,
+    const size_t xstep, const size_t ystep)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -460,49 +467,55 @@ void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize,
 	}
 }
 
-void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
-	double sigma, double border_ratio,
-	cl_mem result/*out, opt*/)
+void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
+    const double sigma, const double border_ratio,
+    cl_mem result/*out, opt*/)
 {
-	double m = 2.25;  // Accuracy increases when m is increased.
-	const double scaler = -1.0 / (2 * sigma * sigma);
-	// For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52}
-	const int diff = std::max<int>(1, m * fabs(sigma));
-	const int expn_size = 2 * diff + 1;
-	std::vector<float> expn(expn_size);
-	for (int i = -diff; i <= diff; ++i) {
-		expn[i + diff] = static_cast<float>(exp(scaler * i * i));
-	}
+    clBlurEx2(image, xsize, ysize, sigma, border_ratio, result);
+
+    return;
+    double m = 2.25;  // Accuracy increases when m is increased.
+    const double scaler = -1.0 / (2 * sigma * sigma);
+    // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52}
+    const int diff = std::max<int>(1, m * fabs(sigma));
+    const int expn_size = 2 * diff + 1;
+    std::vector<float> expn(expn_size);
+    for (int i = -diff; i <= diff; ++i) {
+        expn[i + diff] = static_cast<float>(exp(scaler * i * i));
+    }
 
-	const int xstep = std::max<int>(1, int(sigma / 3));
+    const int xstep = std::max<int>(1, int(sigma / 3));
+    const int ystep = xstep;
+    int dxsize = (xsize + xstep - 1) / xstep;
+    int dysize = (ysize + ystep - 1) / ystep;
 
-	cl_int err = 0;
-	ocl_args_d_t &ocl = getOcl();
-	cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data());
+    cl_int err = 0;
+    ocl_args_d_t &ocl = getOcl();
+    cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data());
 
-	if (xstep > 1)
-	{
-		ocl.allocA(sizeof(cl_float) * xsize * ysize);
-		clConvolutionX(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA);
-		clConvolutionY(ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, result ? result : image);
-		clUpsampleEx2(result ? result : image, xsize, ysize, xstep, xstep, result ? result : image);
-	}
-	else
-	{
-		ocl.allocA(sizeof(cl_float) * xsize * ysize);
-		clConvolutionX(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA);
-		clConvolutionY(ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, result ? result : image);
-	}
+    if (xstep > 1)
+    {
+        ocl.allocA(sizeof(cl_float) * dxsize * ysize);
+        ocl.allocB(sizeof(cl_float) * dxsize * dysize);
 
-	clReleaseMemObject(mem_expn);
+        clConvolutionEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        clConvolutionEx(ocl.srcB, ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio);
+        clUpsampleEx(result ? result : image, ocl.srcB, xsize, ysize, xstep, ystep);
+    }
+    else
+    {
+        ocl.allocA(sizeof(cl_float) * xsize * ysize);
+        clConvolutionEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        clConvolutionEx(result ? result : image, ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio);
+    }
+
+    clReleaseMemObject(mem_expn);
 }
-void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
-              const double sigma, const double border_ratio,
-              cl_mem result/*out, opt*/)
-{
-	clBlurEx2(image, xsize, ysize, sigma, border_ratio, result);
 
-	return;
+void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
+	double sigma, double border_ratio,
+    cl_mem result/*out, opt*/)
+{
 	double m = 2.25;  // Accuracy increases when m is increased.
 	const double scaler = -1.0 / (2 * sigma * sigma);
 	// For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52}
@@ -514,9 +527,6 @@ void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
 	}
 
 	const int xstep = std::max<int>(1, int(sigma / 3));
-	const int ystep = xstep;
-	int dxsize = (xsize + xstep - 1) / xstep;
-	int dysize = (ysize + ystep - 1) / ystep;
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
@@ -524,24 +534,22 @@ void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
 
 	if (xstep > 1)
 	{
-		ocl.allocA(sizeof(cl_float) * dxsize * ysize);
-		ocl.allocB(sizeof(cl_float) * dxsize * dysize);
-
-		clConvolutionEx(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA);
-		clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, ocl.srcB);
-		clUpsampleEx(ocl.srcB, xsize, ysize, xstep, ystep, result ? result : image);
+		ocl.allocA(sizeof(cl_float) * xsize * ysize);
+		clConvolutionX(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+		clConvolutionY(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+		clUpsampleEx2(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep);
 	}
 	else
 	{
 		ocl.allocA(sizeof(cl_float) * xsize * ysize);
-		clConvolutionEx(image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio, ocl.srcA);
-		clConvolutionEx(ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio, result ? result : image);
+		clConvolutionX(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+		clConvolutionY(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
 	}
 
 	clReleaseMemObject(mem_expn);
 }
 
-void clOpsinDynamicsImageEx(ocl_channels &rgb/*in,out*/, const size_t xsize, const size_t ysize)
+void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t ysize)
 {
 	static const double kSigma = 1.1;
 
@@ -581,9 +589,10 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb/*in,out*/, const size_t xsize, con
 }
 
 
-void clMaskHighIntensityChangeEx(ocl_channels &xyb0/*in,out*/,
-								 ocl_channels &xyb1/*in,out*/,
-                                 const size_t xsize, const size_t ysize)
+void clMaskHighIntensityChangeEx(
+    ocl_channels &xyb0/*in,out*/,
+    ocl_channels &xyb1/*in,out*/,
+    const size_t xsize, const size_t ysize)
 {
 	cl_int channel_size = xsize * ysize * sizeof(float);
 
@@ -631,8 +640,10 @@ void clMaskHighIntensityChangeEx(ocl_channels &xyb0/*in,out*/,
 	ocl.releaseMemChannels(c1);
 }
 
-void clEdgeDetectorMapEx(const ocl_channels &rgb, const ocl_channels &rgb2,
-    const size_t xsize, const size_t ysize, const size_t step, cl_mem result/*out*/)
+void clEdgeDetectorMapEx(
+    cl_mem result/*out*/,
+    const ocl_channels &rgb, const ocl_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step)
 {
 	cl_int channel_size = xsize * ysize * sizeof(float);
 
@@ -685,9 +696,11 @@ void clEdgeDetectorMapEx(const ocl_channels &rgb, const ocl_channels &rgb2,
 	ocl.releaseMemChannels(rgb2_blured);
 }
 
-void clBlockDiffMapEx(const ocl_channels &rgb, const ocl_channels &rgb2,
-	const size_t xsize, const size_t ysize, const size_t step,
-	cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/)
+void clBlockDiffMapEx(
+    cl_mem block_diff_dc/*out*/,
+    cl_mem block_diff_ac/*out*/,
+    const ocl_channels &rgb, const ocl_channels &rgb2,
+	const size_t xsize, const size_t ysize, const size_t step)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -725,9 +738,10 @@ void clBlockDiffMapEx(const ocl_channels &rgb, const ocl_channels &rgb2,
 	}
 }
 
-void clEdgeDetectorLowFreqEx(const ocl_channels &rgb, const ocl_channels &rgb2,
-	const size_t xsize, const size_t ysize, const size_t step,
-	cl_mem block_diff_ac/*out*/)
+void clEdgeDetectorLowFreqEx(
+    cl_mem block_diff_ac/*in,out*/,
+    const ocl_channels &rgb, const ocl_channels &rgb2,
+	const size_t xsize, const size_t ysize, const size_t step)
 {
 	cl_int channel_size = xsize * ysize * sizeof(float);
 
@@ -779,7 +793,10 @@ void clEdgeDetectorLowFreqEx(const ocl_channels &rgb, const ocl_channels &rgb2,
 	ocl.releaseMemChannels(rgb2_blured);
 }
 
-void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size_t ysize, ocl_channels mask/*out*/)
+void clDiffPrecomputeEx(
+    ocl_channels &mask/*out*/,
+    const ocl_channels &xyb0, const ocl_channels &xyb1,
+    const size_t xsize, const size_t ysize)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -832,7 +849,7 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w)
 	}
 }
 
-void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize)
+void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize)
 {
 	if (xsize < 4 || ysize < 4) {
 		// TODO: Make this work for small dimensions as well.
@@ -865,7 +882,10 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize)
   }
 }
 
-void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset)
+void clMinSquareValEx(
+    cl_mem img/*in,out*/,
+    const size_t xsize, const size_t ysize,
+    const size_t square_size, const size_t offset)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -899,7 +919,6 @@ void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t s
 	}
 }
 
-
 static void MakeMask(double extmul, double extoff,
 	double mul, double offset,
 	double scaler, double *result)
@@ -1034,12 +1053,12 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	ocl.releaseMemChannels(xyb_dc);
 }
 
-
-void clMaskEx(const ocl_channels &rgb, const ocl_channels &rgb2,
-	const size_t xsize, const size_t ysize,
-	ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/)
+void clMaskEx(
+    ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/,
+    const ocl_channels &rgb, const ocl_channels &rgb2,
+	const size_t xsize, const size_t ysize)
 {
-    clDiffPrecomputeEx(rgb, rgb2, xsize, ysize, mask);
+    clDiffPrecomputeEx(mask, rgb, rgb2, xsize, ysize);
     for (int i = 0; i < 3; i++)
     {
         clAverage5x5Ex(mask.ch[i], xsize, ysize);
@@ -1064,15 +1083,15 @@ void clMaskEx(const ocl_channels &rgb, const ocl_channels &rgb2,
 }
 
 void clCombineChannelsEx(
+    cl_mem result/*out*/,
 	const ocl_channels &mask,
 	const ocl_channels &mask_dc,
-	cl_mem block_diff_dc,
-	cl_mem block_diff_ac,
-	cl_mem edge_detector_map,
-	size_t xsize, size_t ysize,
-	size_t res_xsize,
-	size_t step,
-	cl_mem result/*out*/)
+    const size_t xsize, const size_t ysize,
+	const cl_mem block_diff_dc,
+	const cl_mem block_diff_ac,
+	const cl_mem edge_detector_map,
+	const size_t res_xsize,
+	const size_t step)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -1114,7 +1133,7 @@ void clCombineChannelsEx(
 	}
 }
 
-void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step)
+void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysize, const int step)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -1156,7 +1175,7 @@ void clUpsampleSquareRootEx(cl_mem diffmap, size_t xsize, size_t ysize, int step
     clReleaseMemObject(mem_diffmap);
 }
 
-void clRemoveBorderEx(cl_mem in, size_t xsize, size_t ysize, int step, cl_mem out)
+void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const size_t ysize, const int step)
 {
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
@@ -1210,7 +1229,7 @@ void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in)
 	}
 }
 
-void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, int step)
+void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step)
 {
 	clUpsampleSquareRootEx(diffmap, xsize, ysize, step);
 
@@ -1223,7 +1242,7 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize,
 
 	ocl_args_d_t &ocl = getOcl();
 	cl_mem blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float));
-	clRemoveBorderEx(diffmap, xsize, ysize, step, blurred);
+	clRemoveBorderEx(blurred, diffmap, xsize, ysize, step);
 
 	static const double border_ratio = 0.03027655136;
 	clBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio);
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index a8be9a42..f1aa6c22 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -40,58 +40,105 @@ void clMask(
     const float* r,  const float* g,  const float* b,
     const float* r2, const float* g2, const float* b2);
 
-void clMaskHighIntensityChangeEx(ocl_channels &xyb0/*in,out*/,
+void clConvolutionEx(
+    cl_mem result/*out*/,
+    const cl_mem inp, size_t xsize, size_t ysize,
+    const cl_mem multipliers, size_t len,
+    int xstep, int offset, double border_ratio);
+
+void clConvolutionX(
+    cl_mem result/*out*/,
+    const cl_mem inp, size_t xsize, size_t ysize,
+    const cl_mem multipliers, size_t len,
+    int xstep, int offset, double border_ratio);
+
+void clConvolutionY(
+    cl_mem result/*out*/,
+    const cl_mem inp, size_t xsize, size_t ysize,
+    const cl_mem multipliers, size_t len,
+    int xstep, int offset, double border_ratio);
+
+void clUpsampleEx2(
+    cl_mem result/*out*/,
+    const cl_mem image, size_t xsize, size_t ysize,
+    size_t xstep, size_t ystep);
+
+void clUpsampleEx(
+    cl_mem result/*out*/,
+    const cl_mem image,
+    const size_t xsize, const size_t ysize,
+    const size_t xstep, const size_t ystep);
+
+void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
+    const double sigma, const double border_ratio,
+    cl_mem result = nullptr/*out, opt*/);
+
+void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
+    double sigma, double border_ratio,
+    cl_mem result = NULL/*out, opt*/);
+
+void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t ysize);
+
+void clMaskHighIntensityChangeEx(
+    ocl_channels &xyb0/*in,out*/,
 	ocl_channels &xyb1/*in,out*/,
 	const size_t xsize, const size_t ysize);
 
-void clMaskEx(const ocl_channels &rgb, const ocl_channels &rgb2,
-	const size_t xsize, const size_t ysize,
-	ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/);
-
-void clEdgeDetectorMapEx(const ocl_channels &rgb, const ocl_channels &rgb2,
-    const size_t xsize, const size_t ysize, const size_t step, cl_mem result/*out*/);
+void clEdgeDetectorMapEx(
+    cl_mem result/*out*/,
+    const ocl_channels &rgb, const ocl_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step);
+
+void clBlockDiffMapEx(
+    cl_mem block_diff_dc/*out*/,
+    cl_mem block_diff_ac/*out*/,
+    const ocl_channels &rgb, const ocl_channels &rgb2,
+	const size_t xsize, const size_t ysize, const size_t step);
+
+void clEdgeDetectorLowFreqEx(
+    cl_mem block_diff_ac/*in,out*/,
+    const ocl_channels &rgb, const ocl_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step);
+
+void clDiffPrecomputeEx(
+    ocl_channels &mask/*out*/,
+    const ocl_channels &xyb0, const ocl_channels &xyb1,
+    const size_t xsize, const size_t ysize);
 
-void clBlockDiffMapEx(const ocl_channels &rgb, const ocl_channels &rgb2,
-	const size_t xsize, const size_t ysize, const size_t step,
-	cl_mem block_diff_dc/*out*/, cl_mem block_diff_ac/*out*/);
+void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w);
 
-void clEdgeDetectorLowFreqEx(const ocl_channels &rgb, const ocl_channels &rgb2,
-	const size_t xsize, const size_t ysize, const size_t step,
-	cl_mem block_diff_ac/*in,out*/);
+void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize);
 
-void clBlurEx(cl_mem image, const size_t xsize, const size_t ysize, const double sigma, const double border_ratio, cl_mem result = nullptr);
+void clMinSquareValEx(
+    cl_mem img/*in,out*/,
+    const size_t xsize, const size_t ysize,
+    const size_t square_size, const size_t offset);
 
-void clOpsinDynamicsImageEx(ocl_channels &rgb/*in,out*/, const size_t xsize, const size_t ysize);
+void clMaskEx(
+    ocl_channels mask/*out*/, ocl_channels mask_dc/*out*/,
+    const ocl_channels &rgb, const ocl_channels &rgb2,
+	const size_t xsize, const size_t ysize);
 
 void clCombineChannelsEx(
+    cl_mem result/*out*/,
 	const ocl_channels &mask,
 	const ocl_channels &mask_dc,
-	cl_mem block_diff_dc,
-	cl_mem block_diff_ac,
-	cl_mem edge_detector_map,
-	size_t xsize, size_t ysize,
-	size_t res_xsize,
-	size_t step,
-	cl_mem result/*out*/);
-
-void clConvolutionEx(cl_mem inp, size_t xsize, size_t ysize,
-	cl_mem multipliers, size_t len,
-	int xstep, int offset, double border_ratio,
-	cl_mem result/*out*/);
-
-void clMinSquareValEx(cl_mem img/*in,out*/, size_t xsize, size_t ysize, size_t square_size, size_t offset);
+    const size_t xsize, const size_t ysize,
+	const cl_mem block_diff_dc,
+	const cl_mem block_diff_ac,
+	const cl_mem edge_detector_map,
+	const size_t res_xsize,
+	const size_t step);
 
-void clUpsampleEx(cl_mem image, size_t xsize, size_t ysize,
-	size_t xstep, size_t ystep,
-	cl_mem result/*out*/);
+void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysize, const int step);
 
 void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, int step);
 
-void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w);
+void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const size_t ysize, const int step);
 
-void clDiffPrecomputeEx(ocl_channels xyb0, ocl_channels xyb1, size_t xsize, size_t ysize, ocl_channels mask/*out*/);
+void clAddBorderEx(cl_mem out, const size_t xsize, const size_t ysize, const int step, const cl_mem in);
 
-void clAverage5x5Ex(cl_mem img/*in,out*/, size_t xsize, size_t ysize);
+void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step);
 
 class guetzli::OutputImage;
 
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 58a23d35..28ae9d1b 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -83,7 +83,7 @@ void tclEdgeDetectorMap(const float* r, const float* g, const float* b,
 	ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
 	cl_mem edge = ocl.allocMem(edgemap_size);
 
-	clEdgeDetectorMapEx(xyb0, xyb1, xsize, ysize, step, edge);
+	clEdgeDetectorMapEx(edge, xyb0, xyb1, xsize, ysize, step);
 
 	cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, edge, true, CL_MAP_READ, 0, edgemap_size, 0, NULL, NULL, &err);
 	err = clFinish(ocl.commandQueue);
@@ -117,7 +117,7 @@ void tclBlockDiffMap(const float* r, const float* g, const float* b,
 	cl_mem block_diff_dc = ocl.allocMem(reschannel_size);
 	cl_mem block_diff_ac = ocl.allocMem(reschannel_size);
 
-	clBlockDiffMapEx(xyb0, xyb1, xsize, ysize, step, block_diff_dc, block_diff_ac);
+	clBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step);
 
 	cl_float *r_dc = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_dc, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err);
 	cl_float *r_ac = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_ac, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err);
@@ -156,7 +156,7 @@ void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
 
 	cl_mem block_diff_ac = ocl.allocMem(reschannel_size, orign_ac);
 
-	clEdgeDetectorLowFreqEx(xyb0, xyb1, xsize, ysize, step, block_diff_ac);
+	clEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step);
 
 	cl_float *r_ac = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, block_diff_ac, true, CL_MAP_READ, 0, reschannel_size, 0, NULL, NULL, &err);
 	err = clFinish(ocl.commandQueue);
@@ -187,7 +187,7 @@ void tclMask(const float* r, const float* g, const float* b,
 	ocl_channels mask = ocl.allocMemChannels(channel_size);
 	ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
 
-	clMaskEx(rgb, rgb2, xsize, ysize, mask/*out*/, mask_dc/*out*/);
+	clMaskEx(mask/*out*/, mask_dc/*out*/, rgb, rgb2, xsize, ysize);
 
 	cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
 	cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
@@ -237,10 +237,10 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const
 	ocl_channels mask_dc = ocl.allocMemChannels(channel_size, mask_xyb_dc_x, mask_xyb_dc_y, mask_xyb_dc_b);
 	cl_mem cl_block_diff_dc = ocl.allocMem(3 * res_channel_size, block_diff_dc);
 	cl_mem cl_block_diff_ac = ocl.allocMem(3 * res_channel_size, block_diff_ac);
-	cl_mem cl_edge_detector_map = ocl.allocMem(3 * res_xsize * res_ysize * sizeof(float), edge_detector_map);
-	cl_mem cl_result = ocl.allocMem(res_xsize * res_ysize * sizeof(float), init_result);
+	cl_mem cl_edge_detector_map = ocl.allocMem(3 * res_channel_size, edge_detector_map);
+	cl_mem cl_result = ocl.allocMem(res_channel_size, init_result);
 
-	clCombineChannelsEx(mask, mask_dc, cl_block_diff_dc, cl_block_diff_ac, cl_edge_detector_map, xsize, ysize, res_xsize, step, cl_result);
+	clCombineChannelsEx(cl_result, mask, mask_dc, xsize, ysize, cl_block_diff_dc, cl_block_diff_ac, cl_edge_detector_map, res_xsize, step);
 
 	cl_float *result_tmp = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_result, true, CL_MAP_READ, 0, res_xsize * res_ysize * sizeof(float), 0, NULL, NULL, &err);
 
@@ -316,7 +316,7 @@ void tclConvolution(size_t xsize, size_t ysize,
 	cl_mem i = ocl.allocMem(inp_size, inp);
 	cl_mem m = ocl.allocMem(multipliers_size, multipliers);
 
-	clConvolutionEx(i, xsize, ysize, m, len, xstep, offset, border_ratio, r);
+	clConvolutionEx(r, i, xsize, ysize, m, len, xstep, offset, border_ratio);
 
 	cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err);
 	err = clFinish(ocl.commandQueue);
@@ -345,7 +345,7 @@ void tclUpsample(float* image, size_t xsize, size_t ysize,
 	ocl.allocA(result_size);
 	cl_mem r = ocl.srcA;
 
-	clUpsampleEx(img, xsize, ysize, xstep, ystep, r);
+	clUpsampleEx(r, img, xsize, ysize, xstep, ystep);
 
 	cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err);
 	err = clFinish(ocl.commandQueue);
@@ -372,7 +372,7 @@ void tclDiffPrecompute(
   ocl_channels cl_xyb1 = ocl.allocMemChannels(channel_size, xyb1[0].data(), xyb1[1].data(), xyb1[2].data());
   ocl_channels cl_mask = ocl.allocMemChannels(channel_size);
 
-  clDiffPrecomputeEx(cl_xyb0, cl_xyb1, xsize, ysize, cl_mask);
+  clDiffPrecomputeEx(cl_mask, cl_xyb0, cl_xyb1, xsize, ysize);
 
   cl_float *r_x = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.x, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
   cl_float *r_y = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, cl_mask.y, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);

From b0d7b80346790b41204a419a6f598f518407c7fc Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 24 May 2017 20:21:16 +0800
Subject: [PATCH 111/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=8F=82=E6=95=B0?=
 =?UTF-8?q?=E8=A7=84=E8=8C=83?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl       | 707 ++++++++++++++++++-----------------
 clguetzli/clguetzli.cpp      | 187 ++++-----
 clguetzli/clguetzli.h        |   2 +-
 clguetzli/clguetzli_test.cpp |  28 --
 clguetzli/clguetzli_test.h   |   4 -
 clguetzli/ocl.h              |   1 -
 6 files changed, 432 insertions(+), 497 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 9722b08d..9639c018 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -7,6 +7,7 @@
 #define kDCTBlockSize   (kBlockEdge * kBlockEdge)
 #define kBlockEdgeHalf  (kBlockEdge / 2)
 #define kBlockHalf      (kBlockEdge * kBlockEdgeHalf)
+#define kComputeBlockSize (kBlockSize * 3)
 
 void   XybToVals(double x, double y, double z, double *valx, double *valy, double *valz);
 double InterpolateClampNegative(__global const double *array, int size, double sx);
@@ -31,64 +32,52 @@ void Butteraugli8x8CornerEdgeDetectorDiff(
     __global const float *r2, __global const float* g2, __global const float *b2,
     double* diff_xyb);
 
-__kernel void clOpsinDynamicsImage(
-    __global float *r, __global float *g, __global float *b,
-    __global const float *r_blurred, __global const float *g_blurred, __global const float *b_blurred,
-    int size)
+__kernel void clConvolution(
+	__global float* result,
+	__global const float* inp, const int xsize,
+	__global const float* multipliers, const int len,
+    const int xstep, const int offset, const float border_ratio)
 {
-    const int i = get_global_id(0);
-    double pre[3] = { r_blurred[i], g_blurred[i],  b_blurred[i] };
-    double pre_mixed[3];
-    OpsinAbsorbance(pre, pre_mixed);
-
-    double sensitivity[3];
-    sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];
-    sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1];
-    sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2];
+    const int ox = get_global_id(0);
+    const int y = get_global_id(1);
 
-    double cur_rgb[3] = { r[i], g[i],  b[i] };
-    double cur_mixed[3];
-    OpsinAbsorbance(cur_rgb, cur_mixed);
-    cur_mixed[0] *= sensitivity[0];
-    cur_mixed[1] *= sensitivity[1];
-    cur_mixed[2] *= sensitivity[2];
+    const int oxsize = get_global_size(0);
+    const int ysize = get_global_size(1);
 
-    double x, y, z;
-    RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z);
-    r[i] = x;
-    g[i] = y;
-    b[i] = z;
-}
+    const int x = ox * xstep;
 
-__kernel void clMinSquareVal(__global const float* pA, __global float* pC, int square_size, int offset)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-    const int width = get_global_size(0);
-    const int height = get_global_size(1);
+    float weight_no_border = 0;
+    for (int j = 0; j <= 2 * offset; j++)
+    {
+        weight_no_border += multipliers[j];
+    }
 
-    int minH = offset > y ? 0 : y - offset;
-    int maxH = min(y + square_size - offset, height);
+    int minx = x < offset ? 0 : x - offset;
+    int maxx = min(xsize, x + len - offset);
 
-    int minW = offset > x ? 0 : x - offset;
-    int maxW = min(x + square_size - offset, width);
+    float weight = 0.0;
+    for (int j = minx; j < maxx; j++)
+    {
+        weight += multipliers[j - x + offset];
+    }
 
-    float minValue = pA[minH * width + minW];
+    weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
+    float scale = 1.0 / weight;
 
-    for (int j = minH; j < maxH; j++)
+    float sum = 0.0;
+    for (int j = minx; j < maxx; j++)
     {
-        for (int i = minW; i < maxW; i++)
-        {
-            float tmp = pA[j * width + i];
-            if (tmp < minValue) minValue = tmp;
-        }
+        sum += inp[y * xsize + j] * multipliers[j - x + offset];
     }
 
-    pC[y * width + x] = minValue;
+    result[ox * ysize + y] = sum * scale;
 }
 
-__kernel void clConvolutionX(__global const float* multipliers, __global const float* inp, __global float* result,
-    int step, int len, int offset, float border_ratio)
+__kernel void clConvolutionX(
+	__global float* result,
+	__global const float* inp,
+	__global const float* multipliers, const int len,
+	const int step, const int offset, const float border_ratio)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -125,8 +114,11 @@ __kernel void clConvolutionX(__global const float* multipliers, __global const f
     result[y * xsize + x] = sum * scale;
 }
 
-__kernel void clConvolutionY(__global const float* multipliers, __global const float* inp, __global float* result,
-    int step, int len, int offset, float border_ratio)
+__kernel void clConvolutionY(
+	__global float* result,
+	__global const float* inp,
+	__global const float* multipliers, const int len,
+    const int step, const int offset, const float border_ratio)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -164,147 +156,270 @@ __kernel void clConvolutionY(__global const float* multipliers, __global const f
     result[y * xsize + x] = sum * scale;
 }
 
-__kernel void clConvolution(__global const float* multipliers, __global const float* inp, __global float* result,
-    int xsize, int xstep, int len, int offset, float border_ratio)
+__kernel void clSquareSample(
+	__global float* result,
+	__global const float* image,
+	const int xstep, const int ystep)
 {
-    const int ox = get_global_id(0);
+    const int x = get_global_id(0);
     const int y = get_global_id(1);
 
-    const int oxsize = get_global_size(0);
-    const int ysize = get_global_size(1);
+    int x_sample = x - x % xstep;
+    int y_sample = y - y % ystep;
 
-    const int x = ox * xstep;
+    if (x_sample == x && y_sample == y) return;
 
-    float weight_no_border = 0;
-    for (int j = 0; j <= 2 * offset; j++)
-    {
-        weight_no_border += multipliers[j];
-    }
+    const int xsize = get_global_size(0);
+    const int ysize = get_global_size(1);
 
-    int minx = x < offset ? 0 : x - offset;
-    int maxx = min(xsize, x + len - offset);
+    result[y * xsize + x] = image[y_sample * xsize + x_sample];
+}
 
-    float weight = 0.0;
-    for (int j = minx; j < maxx; j++)
-    {
-        weight += multipliers[j - x + offset];
-    }
+__kernel void clOpsinDynamicsImage(
+    __global float *r, __global float *g, __global float *b,
+    __global const float *r_blurred, __global const float *g_blurred, __global const float *b_blurred)
+{
+    const int i = get_global_id(0);
+    double pre[3] = { r_blurred[i], g_blurred[i],  b_blurred[i] };
+    double pre_mixed[3];
+    OpsinAbsorbance(pre, pre_mixed);
 
-    weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
-    float scale = 1.0 / weight;
+    double sensitivity[3];
+    sensitivity[0] = Gamma(pre_mixed[0]) / pre_mixed[0];
+    sensitivity[1] = Gamma(pre_mixed[1]) / pre_mixed[1];
+    sensitivity[2] = Gamma(pre_mixed[2]) / pre_mixed[2];
 
-    float sum = 0.0;
-    for (int j = minx; j < maxx; j++)
-    {
-        sum += inp[y * xsize + j] * multipliers[j - x + offset];
-    }
+    double cur_rgb[3] = { r[i], g[i],  b[i] };
+    double cur_mixed[3];
+    OpsinAbsorbance(cur_rgb, cur_mixed);
+    cur_mixed[0] *= sensitivity[0];
+    cur_mixed[1] *= sensitivity[1];
+    cur_mixed[2] *= sensitivity[2];
 
-    result[ox * ysize + y] = sum * scale;
+    double x, y, z;
+    RgbToXyb(cur_mixed[0], cur_mixed[1], cur_mixed[2], &x, &y, &z);
+    r[i] = x;
+    g[i] = y;
+    b[i] = z;
 }
 
-__kernel void clSquareSample(__global const float* pA, __global float* pC, int xstep, int ystep)
+__kernel void clMaskHighIntensityChange(
+    __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b,
+    __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b,
+    __global const float *c0_x, __global const float *c0_y, __global const float *c0_b,
+    __global const float *c1_x, __global const float *c1_y, __global const float *c1_b
+)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
+    const int xsize = get_global_size(0);
+    const int ysize = get_global_size(1);
 
-    int x_sample = x - x % xstep;
-    int y_sample = y - y % ystep;
+    size_t ix = y * xsize + x;
+    const double ave[3] = {
+        (c0_x[ix] + c1_x[ix]) * 0.5,
+        (c0_y[ix] + c1_y[ix]) * 0.5,
+        (c0_b[ix] + c1_b[ix]) * 0.5,
+    };
+    double sqr_max_diff = -1;
+    {
+        int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) };
+        int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize };
+        for (int dir = 0; dir < 4; ++dir) {
+            if (border[dir]) {
+                continue;
+            }
+            const int ix2 = ix + offset[dir];
+            double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1];
+            diff *= diff;
+            if (sqr_max_diff < diff) {
+                sqr_max_diff = diff;
+            }
+        }
+    }
+    const double kReductionX = 275.19165240059317;
+    const double kReductionY = 18599.41286306991;
+    const double kReductionZ = 410.8995306951065;
+    const double kChromaBalance = 106.95800948271017;
+    double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance);
 
-    if (x_sample == x && y_sample == y) return;
+    const double mix[3] = {
+        chroma_scale * kReductionX / (sqr_max_diff + kReductionX),
+        kReductionY / (sqr_max_diff + kReductionY),
+        chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ),
+    };
+    // Interpolate lineraly between the average color and the actual
+    // color -- to reduce the importance of this pixel.
+    xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]);
+    xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]);
 
-    const int xsize = get_global_size(0);
-    const int ysize = get_global_size(1);
+    xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]);
+    xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]);
 
-    pC[y * xsize + x] = pA[y_sample * xsize + x_sample];
+    xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]);
+    xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]);
 }
 
-__kernel void clDownSample(__global const float* pA, __global float* pC, int xstep, int ystep)
+__kernel void clEdgeDetectorMap(
+	__global float *result,
+    __global const float *r, __global const float *g, __global const float* b,
+    __global const float *r2, __global const float* g2, __global const float *b2,
+    int xsize, int ysize, int step)
 {
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
+    const int res_x = get_global_id(0);
+    const int res_y = get_global_id(1);
 
-    const int xsize = get_global_size(0);
-    const int ysize = get_global_size(1);
+    const int res_xsize = get_global_size(0);
+    const int res_ysize = get_global_size(1);
 
-    const int oxsize = (xsize + xstep - 1) / xstep;
+    int pos_x = res_x * step;
+    int pos_y = res_y * step;
 
-    const int sample_x = x / xstep;
-    const int sample_y = y / ystep;
+    if (pos_x >= xsize - (8 - step)) return;
+    if (pos_y >= ysize - (8 - step)) return;
 
-    pC[y * xsize + x] = pA[sample_y * oxsize + sample_x];
-}
+    pos_x = min(pos_x, xsize - 8);
+    pos_y = min(pos_y, ysize - 8);
 
-__kernel void clScaleImage(double scale, __global float *result)
-{
-    const int i = get_global_id(0);
-    result[i] *= scale;
+    double diff_xyb[3] = { 0.0 };
+    Butteraugli8x8CornerEdgeDetectorDiff(pos_x, pos_y, xsize, ysize,
+        r, g, b,
+        r2, g2, b2,
+        &diff_xyb[0]);
+
+    int idx = (res_y * res_xsize + res_x) * 3;
+    result[idx] = diff_xyb[0];
+    result[idx + 1] = diff_xyb[1];
+    result[idx + 2] = diff_xyb[2];
 }
 
-__kernel void clRemoveBorder(__global const float *in, int in_xsize, int s, int s2, __global float *out)
+
+__kernel void clBlockDiffMap(
+	__global float* block_diff_dc, __global float* block_diff_ac,
+	__global const float* r, __global const float* g, __global const float* b,
+    __global const float* r2, __global const float* g2, __global const float* b2,
+    int xsize, int ysize, int step)
 {
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
+    const int res_x = get_global_id(0);
+    const int res_y = get_global_id(1);
 
-    const int xsize = get_global_size(0);
-    const int ysize = get_global_size(1);
+    const int res_xsize = get_global_size(0);
+    const int res_ysize = get_global_size(1);
 
-    out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2];
-}
+    int pos_x = res_x * step;
+    int pos_y = res_y * step;
 
-__kernel void clAddBorder(__global float *out, int s, int s2, __global const float *in)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-    const int xsize = get_global_size(0);
-    const int ysize = get_global_size(1);
+    if ((pos_x + kBlockEdge - step - 1) >= xsize) return;
+    if ((pos_y + kBlockEdge - step - 1) >= ysize) return;
 
-	if (x >= xsize - s ||
-	    y >= ysize - s)
-	{
-		return;
-	}
+    size_t res_ix = res_y * res_xsize + res_x;
+    size_t offset = min(pos_y, ysize - 8) * xsize + min(pos_x, xsize - 8);
 
-    const double mul1 = 24.8235314874;
-    out[(y + s2) * xsize + x + s2] += (float)(mul1) * in[y * (xsize - s) + x];
+    double block0[3 * kBlockEdge * kBlockEdge];
+    double block1[3 * kBlockEdge * kBlockEdge];
 
+    double *block0_r = &block0[0];
+    double *block0_g = &block0[kBlockEdge * kBlockEdge];
+    double *block0_b = &block0[2 * kBlockEdge * kBlockEdge];
+
+    double *block1_r = &block1[0];
+    double *block1_g = &block1[kBlockEdge * kBlockEdge];
+    double *block1_b = &block1[2 * kBlockEdge * kBlockEdge];
+
+    for (int y = 0; y < kBlockEdge; y++)
+    {
+        for (int x = 0; x < kBlockEdge; x++)
+        {
+            block0_r[kBlockEdge * y + x] = r[offset + y * xsize + x];
+            block0_g[kBlockEdge * y + x] = g[offset + y * xsize + x];
+            block0_b[kBlockEdge * y + x] = b[offset + y * xsize + x];
+            block1_r[kBlockEdge * y + x] = r2[offset + y * xsize + x];
+            block1_g[kBlockEdge * y + x] = g2[offset + y * xsize + x];
+            block1_b[kBlockEdge * y + x] = b2[offset + y * xsize + x];
+        }
+    }
+
+    double diff_xyb_dc[3] = { 0.0 };
+    double diff_xyb_ac[3] = { 0.0 };
+    double diff_xyb_edge_dc[3] = { 0.0 };
+
+    ButteraugliBlockDiff(block0, block1, diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc);
+
+    for (int i = 0; i < 3; i++)
+    {
+        block_diff_dc[3 * res_ix + i] = diff_xyb_dc[i];
+        block_diff_ac[3 * res_ix + i] = diff_xyb_ac[i];
+    }
 }
 
-__kernel void clCombineChannels(
-    __global const float *mask_x, __global const float *mask_y, __global const float *mask_b,
-    __global const float *mask_dc_x, __global const float *mask_dc_y, __global const float *mask_dc_b,
-    __global const float *block_diff_dc,
-    __global const float *block_diff_ac,
-    __global float *edge_detector_map,
-    int xsize, int ysize,
-    int res_xsize,
-    int step,
-    __global float *result)
+__kernel void clEdgeDetectorLowFreq(
+	__global float *block_diff_ac,
+    __global const float *r, __global const float *g, __global const float* b,
+    __global const float *r2, __global const float* g2, __global const float *b2,
+    int xsize, int ysize, int step)
 {
-    const int res_x = get_global_id(0) * step;
-    const int res_y = get_global_id(1) * step;
+    const int res_x = get_global_id(0);
+    const int res_y = get_global_id(1);
 
-    double mask[3];
-    double dc_mask[3];
-    mask[0] = mask_x[(res_y + 3) * xsize + (res_x + 3)];
-    dc_mask[0] = mask_dc_x[(res_y + 3) * xsize + (res_x + 3)];
+    if (res_x < 8 / step) return;
 
-    mask[1] = mask_y[(res_y + 3) * xsize + (res_x + 3)];
-    dc_mask[1] = mask_dc_y[(res_y + 3) * xsize + (res_x + 3)];
+    const int res_xsize = get_global_size(0);
+    const int res_ysize = get_global_size(1);
 
-    mask[2] = mask_b[(res_y + 3) * xsize + (res_x + 3)];
-    dc_mask[2] = mask_dc_b[(res_y + 3) * xsize + (res_x + 3)];
+    int pos_x = (res_x - (8 / step)) * step;
+    int pos_y = res_y * step;
 
-    size_t res_ix = (res_y * res_xsize + res_x) / step;
-    result[res_ix] = (float)(
-        DotProduct(&block_diff_dc[3 * res_ix], dc_mask) +
-        DotProduct(&block_diff_ac[3 * res_ix], mask) +
-        DotProduct(&edge_detector_map[3 * res_ix], mask));
+    if (pos_x + 8 >= xsize) return;
+    if (pos_y + 8 >= ysize) return;
+
+    int ix = pos_y * xsize + pos_x;
+
+    double diff[4][3];
+    __global const float* blurred0[3] = { r, g, b };
+    __global const float* blurred1[3] = { r2, g2, b2 };
+
+    for (int i = 0; i < 3; ++i) {
+        int ix2 = ix + 8;
+        diff[0][i] =
+            ((blurred1[i][ix] - blurred0[i][ix]) +
+            (blurred0[i][ix2] - blurred1[i][ix2]));
+        ix2 = ix + 8 * xsize;
+        diff[1][i] =
+            ((blurred1[i][ix] - blurred0[i][ix]) +
+            (blurred0[i][ix2] - blurred1[i][ix2]));
+        ix2 = ix + 6 * xsize + 6;
+        diff[2][i] =
+            ((blurred1[i][ix] - blurred0[i][ix]) +
+            (blurred0[i][ix2] - blurred1[i][ix2]));
+        ix2 = ix + 6 * xsize - 6;
+        diff[3][i] = pos_x < 8 ? 0 :
+            ((blurred1[i][ix] - blurred0[i][ix]) +
+            (blurred0[i][ix2] - blurred1[i][ix2]));
+    }
+    double max_diff_xyb[3] = { 0 };
+    for (int k = 0; k < 4; ++k) {
+        double diff_xyb[3] = { 0 };
+        XybDiffLowFreqSquaredAccumulate(diff[k][0], diff[k][1], diff[k][2],
+            0, 0, 0, 1.0,
+            diff_xyb);
+        for (int i = 0; i < 3; ++i) {
+            max_diff_xyb[i] = max(max_diff_xyb[i], diff_xyb[i]);
+        }
+    }
+
+    int res_ix = res_y * res_xsize + res_x;
+
+    const double kMul = 10;
+
+    block_diff_ac[res_ix * 3]     += max_diff_xyb[0] * kMul;
+    block_diff_ac[res_ix * 3 + 1] += max_diff_xyb[1] * kMul;
+    block_diff_ac[res_ix * 3 + 2] += max_diff_xyb[2] * kMul;
 }
 
 __kernel void clDiffPrecompute(
+    __global float *mask_x, __global float *mask_y, __global float *mask_b,
     __global const float *xyb0_x, __global const float *xyb0_y, __global const float *xyb0_b,
-    __global const float *xyb1_x, __global const float *xyb1_y, __global const float *xyb1_b,
-    __global float *mask_x, __global float *mask_y, __global float *mask_b)
+    __global const float *xyb1_x, __global const float *xyb1_y, __global const float *xyb1_b)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -367,99 +482,79 @@ __kernel void clDiffPrecompute(
     mask_b[ix] = (float)(m);
 }
 
-__kernel void clEdgeDetectorMap(__global float *result,
-    __global const float *r, __global const float *g, __global const float* b,
-    __global const float *r2, __global const float* g2, __global const float *b2,
-    int xsize, int ysize, int step)
+__kernel void clScaleImage(__global float *img, double scale)
 {
-    const int res_x = get_global_id(0);
-    const int res_y = get_global_id(1);
-
-    const int res_xsize = get_global_size(0);
-    const int res_ysize = get_global_size(1);
+    const int i = get_global_id(0);
+    img[i] *= scale;
+}
 
-    int pos_x = res_x * step;
-    int pos_y = res_y * step;
+#define Average5x5_w 0.679144890667f
+__constant float Average5x5_scale = 1.0f / (5.0f + 4 * Average5x5_w);
+__kernel void clAverage5x5(__global float *img, __global const float *img_org)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int xsize = get_global_size(0);
+    const int ysize = get_global_size(1);
 
-    if (pos_x >= xsize - (8 - step)) return;
-    if (pos_y >= ysize - (8 - step)) return;
+    const int row0 = y * xsize;
+	if (x - 1 >= 0) {
+		img[row0 + x] += img_org[row0 + x - 1];
+	}
+	if (x + 1 < xsize) {
+		img[row0 + x] += img_org[row0 + x + 1];
+	}
 
-    pos_x = min(pos_x, xsize - 8);
-    pos_y = min(pos_y, ysize - 8);
+	if (y > 0) {
+		const int rowd1 = row0 - xsize;
+		if (x - 1 >= 0) {
+			img[row0 + x] += img_org[rowd1 + x - 1] * Average5x5_w;
+		}
+		img[row0 + x] += img_org[rowd1 + x];
+		if (x + 1 < xsize) {
+			img[row0 + x] += img_org[rowd1 + x + 1] * Average5x5_w;
+		}
+	}
 
-    double diff_xyb[3] = { 0.0 };
-    Butteraugli8x8CornerEdgeDetectorDiff(pos_x, pos_y, xsize, ysize,
-        r, g, b,
-        r2, g2, b2,
-        &diff_xyb[0]);
+	if (y + 1 < ysize) {
+		const int rowu1 = row0 + xsize;
+		if (x - 1 >= 0) {
+			img[row0 + x] += img_org[rowu1 + x - 1] * Average5x5_w;
+		}
+		img[row0 + x] += img_org[rowu1 + x];
+		if (x + 1 < xsize) {
+			img[row0 + x] += img_org[rowu1 + x + 1] * Average5x5_w;
+		}
+	}
 
-    int idx = (res_y * res_xsize + res_x) * 3;
-    result[idx] = diff_xyb[0];
-    result[idx + 1] = diff_xyb[1];
-    result[idx + 2] = diff_xyb[2];
+	img[row0 + x] *= Average5x5_scale;
 }
 
-__kernel void clEdgeDetectorLowFreq(__global float *result,
-    __global const float *r, __global const float *g, __global const float* b,
-    __global const float *r2, __global const float* g2, __global const float *b2,
-    int xsize, int ysize, int step)
+__kernel void clMinSquareVal(__global float* result, __global const float* img,  int square_size, int offset)
 {
-    const int res_x = get_global_id(0);
-    const int res_y = get_global_id(1);
-
-    if (res_x < 8 / step) return;
-
-    const int res_xsize = get_global_size(0);
-    const int res_ysize = get_global_size(1);
-
-    int pos_x = (res_x - (8 / step)) * step;
-    int pos_y = res_y * step;
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int width = get_global_size(0);
+    const int height = get_global_size(1);
 
-    if (pos_x + 8 >= xsize) return;
-    if (pos_y + 8 >= ysize) return;
+    int minH = offset > y ? 0 : y - offset;
+    int maxH = min(y + square_size - offset, height);
 
-    int ix = pos_y * xsize + pos_x;
+    int minW = offset > x ? 0 : x - offset;
+    int maxW = min(x + square_size - offset, width);
 
-    double diff[4][3];
-    __global const float* blurred0[3] = { r, g, b };
-    __global const float* blurred1[3] = { r2, g2, b2 };
+    float minValue = img[minH * width + minW];
 
-    for (int i = 0; i < 3; ++i) {
-        int ix2 = ix + 8;
-        diff[0][i] =
-            ((blurred1[i][ix] - blurred0[i][ix]) +
-            (blurred0[i][ix2] - blurred1[i][ix2]));
-        ix2 = ix + 8 * xsize;
-        diff[1][i] =
-            ((blurred1[i][ix] - blurred0[i][ix]) +
-            (blurred0[i][ix2] - blurred1[i][ix2]));
-        ix2 = ix + 6 * xsize + 6;
-        diff[2][i] =
-            ((blurred1[i][ix] - blurred0[i][ix]) +
-            (blurred0[i][ix2] - blurred1[i][ix2]));
-        ix2 = ix + 6 * xsize - 6;
-        diff[3][i] = pos_x < 8 ? 0 :
-            ((blurred1[i][ix] - blurred0[i][ix]) +
-            (blurred0[i][ix2] - blurred1[i][ix2]));
-    }
-    double max_diff_xyb[3] = { 0 };
-    for (int k = 0; k < 4; ++k) {
-        double diff_xyb[3] = { 0 };
-        XybDiffLowFreqSquaredAccumulate(diff[k][0], diff[k][1], diff[k][2],
-            0, 0, 0, 1.0,
-            diff_xyb);
-        for (int i = 0; i < 3; ++i) {
-            max_diff_xyb[i] = max(max_diff_xyb[i], diff_xyb[i]);
+    for (int j = minH; j < maxH; j++)
+    {
+        for (int i = minW; i < maxW; i++)
+        {
+            float tmp = img[j * width + i];
+            if (tmp < minValue) minValue = tmp;
         }
     }
 
-    int res_ix = res_y * res_xsize + res_x;
-
-    const double kMul = 10;
-
-    result[res_ix * 3] += max_diff_xyb[0] * kMul;
-    result[res_ix * 3 + 1] += max_diff_xyb[1] * kMul;
-    result[res_ix * 3 + 2] += max_diff_xyb[2] * kMul;
+    result[y * width + x] = minValue;
 }
 
 __kernel void clDoMask(
@@ -495,118 +590,65 @@ __kernel void clDoMask(
 
 }
 
-__kernel void clBlockDiffMap(__global const float* r, __global const float* g, __global const float* b,
-    __global const float* r2, __global const float* g2, __global const float* b2,
-    __global float* block_diff_dc, __global float* block_diff_ac,
-    int xsize, int ysize, int step)
+__kernel void clCombineChannels(
+    __global float *result,
+    __global const float *mask_x, __global const float *mask_y, __global const float *mask_b,
+    __global const float *mask_dc_x, __global const float *mask_dc_y, __global const float *mask_dc_b,
+	const int xsize, const int ysize,
+    __global const float *block_diff_dc,
+    __global const float *block_diff_ac,
+	__global float *edge_detector_map,
+    const int res_xsize,
+    const int step)
 {
-    const int res_x = get_global_id(0);
-    const int res_y = get_global_id(1);
-
-    const int res_xsize = get_global_size(0);
-    const int res_ysize = get_global_size(1);
-
-    int pos_x = res_x * step;
-    int pos_y = res_y * step;
-
-    if ((pos_x + kBlockEdge - step - 1) >= xsize) return;
-    if ((pos_y + kBlockEdge - step - 1) >= ysize) return;
-
-    size_t res_ix = res_y * res_xsize + res_x;
-    size_t offset = min(pos_y, ysize - 8) * xsize + min(pos_x, xsize - 8);
+    const int res_x = get_global_id(0) * step;
+    const int res_y = get_global_id(1) * step;
 
-    double block0[3 * kBlockEdge * kBlockEdge];
-    double block1[3 * kBlockEdge * kBlockEdge];
+    double mask[3];
+    double dc_mask[3];
+    mask[0] = mask_x[(res_y + 3) * xsize + (res_x + 3)];
+    dc_mask[0] = mask_dc_x[(res_y + 3) * xsize + (res_x + 3)];
 
-    double *block0_r = &block0[0];
-    double *block0_g = &block0[kBlockEdge * kBlockEdge];
-    double *block0_b = &block0[2 * kBlockEdge * kBlockEdge];
+    mask[1] = mask_y[(res_y + 3) * xsize + (res_x + 3)];
+    dc_mask[1] = mask_dc_y[(res_y + 3) * xsize + (res_x + 3)];
 
-    double *block1_r = &block1[0];
-    double *block1_g = &block1[kBlockEdge * kBlockEdge];
-    double *block1_b = &block1[2 * kBlockEdge * kBlockEdge];
+    mask[2] = mask_b[(res_y + 3) * xsize + (res_x + 3)];
+    dc_mask[2] = mask_dc_b[(res_y + 3) * xsize + (res_x + 3)];
 
-    for (int y = 0; y < kBlockEdge; y++)
-    {
-        for (int x = 0; x < kBlockEdge; x++)
-        {
-            block0_r[kBlockEdge * y + x] = r[offset + y * xsize + x];
-            block0_g[kBlockEdge * y + x] = g[offset + y * xsize + x];
-            block0_b[kBlockEdge * y + x] = b[offset + y * xsize + x];
-            block1_r[kBlockEdge * y + x] = r2[offset + y * xsize + x];
-            block1_g[kBlockEdge * y + x] = g2[offset + y * xsize + x];
-            block1_b[kBlockEdge * y + x] = b2[offset + y * xsize + x];
-        }
-    }
+    size_t res_ix = (res_y * res_xsize + res_x) / step;
+    result[res_ix] = (float)(
+        DotProduct(&block_diff_dc[3 * res_ix], dc_mask) +
+        DotProduct(&block_diff_ac[3 * res_ix], mask) +
+        DotProduct(&edge_detector_map[3 * res_ix], mask));
+}
 
-    double diff_xyb_dc[3] = { 0.0 };
-    double diff_xyb_ac[3] = { 0.0 };
-    double diff_xyb_edge_dc[3] = { 0.0 };
+__kernel void clRemoveBorder(__global const float *in, int in_xsize, int s, int s2, __global float *out)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
 
-    ButteraugliBlockDiff(block0, block1, diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc);
+    const int xsize = get_global_size(0);
+    const int ysize = get_global_size(1);
 
-    for (int i = 0; i < 3; i++)
-    {
-        block_diff_dc[3 * res_ix + i] = diff_xyb_dc[i];
-        block_diff_ac[3 * res_ix + i] = diff_xyb_ac[i];
-    }
+    out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2];
 }
 
-__kernel void clMaskHighIntensityChange(
-    __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b,
-    __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b,
-    __global const float *c0_x, __global const float *c0_y, __global const float *c0_b,
-    __global const float *c1_x, __global const float *c1_y, __global const float *c1_b
-)
+__kernel void clAddBorder(__global float *out, int s, int s2, __global const float *in)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
     const int xsize = get_global_size(0);
     const int ysize = get_global_size(1);
 
-    size_t ix = y * xsize + x;
-    const double ave[3] = {
-        (c0_x[ix] + c1_x[ix]) * 0.5,
-        (c0_y[ix] + c1_y[ix]) * 0.5,
-        (c0_b[ix] + c1_b[ix]) * 0.5,
-    };
-    double sqr_max_diff = -1;
-    {
-        int offset[4] = { -1, 1, -(int)(xsize), (int)(xsize) };
-        int border[4] = { x == 0, x + 1 == xsize, y == 0, y + 1 == ysize };
-        for (int dir = 0; dir < 4; ++dir) {
-            if (border[dir]) {
-                continue;
-            }
-            const int ix2 = ix + offset[dir];
-            double diff = 0.5 * (c0_y[ix2] + c1_y[ix2]) - ave[1];
-            diff *= diff;
-            if (sqr_max_diff < diff) {
-                sqr_max_diff = diff;
-            }
-        }
-    }
-    const double kReductionX = 275.19165240059317;
-    const double kReductionY = 18599.41286306991;
-    const double kReductionZ = 410.8995306951065;
-    const double kChromaBalance = 106.95800948271017;
-    double chroma_scale = kChromaBalance / (ave[1] + kChromaBalance);
-
-    const double mix[3] = {
-        chroma_scale * kReductionX / (sqr_max_diff + kReductionX),
-        kReductionY / (sqr_max_diff + kReductionY),
-        chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ),
-    };
-    // Interpolate lineraly between the average color and the actual
-    // color -- to reduce the importance of this pixel.
-    xyb0_x[ix] = (float)(mix[0] * c0_x[ix] + (1 - mix[0]) * ave[0]);
-    xyb1_x[ix] = (float)(mix[0] * c1_x[ix] + (1 - mix[0]) * ave[0]);
+	if (x >= xsize - s ||
+	    y >= ysize - s)
+	{
+		return;
+	}
 
-    xyb0_y[ix] = (float)(mix[1] * c0_y[ix] + (1 - mix[1]) * ave[1]);
-    xyb1_y[ix] = (float)(mix[1] * c1_y[ix] + (1 - mix[1]) * ave[1]);
+    const double mul1 = 24.8235314874;
+    out[(y + s2) * xsize + x + s2] += (float)(mul1) * in[y * (xsize - s) + x];
 
-    xyb0_b[ix] = (float)(mix[2] * c0_b[ix] + (1 - mix[2]) * ave[2]);
-    xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]);
 }
 
 __kernel void clUpsampleSquareRoot(__global const float *diffmap, int xsize, int ysize, int step, __global float *diffmap_out)
@@ -642,47 +684,7 @@ __kernel void clUpsampleSquareRoot(__global const float *diffmap, int xsize, int
     }
 }
 
-#define Average5x5_w 0.679144890667f
-__constant float Average5x5_scale = 1.0f / (5.0f + 4 * Average5x5_w);
-__kernel void clAverage5x5(__global float *img, __global const float *img_org)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-    const int xsize = get_global_size(0);
-    const int ysize = get_global_size(1);
-
-    const int row0 = y * xsize;
-	if (x - 1 >= 0) {
-		img[row0 + x] += img_org[row0 + x - 1];
-	}
-	if (x + 1 < xsize) {
-		img[row0 + x] += img_org[row0 + x + 1];
-	}
-
-	if (y > 0) {
-		const int rowd1 = row0 - xsize;
-		if (x - 1 >= 0) {
-			img[row0 + x] += img_org[rowd1 + x - 1] * Average5x5_w;
-		}
-		img[row0 + x] += img_org[rowd1 + x];
-		if (x + 1 < xsize) {
-			img[row0 + x] += img_org[rowd1 + x + 1] * Average5x5_w;
-		}
-	}
-
-	if (y + 1 < ysize) {
-		const int rowu1 = row0 + xsize;
-		if (x - 1 >= 0) {
-			img[row0 + x] += img_org[rowu1 + x - 1] * Average5x5_w;
-		}
-		img[row0 + x] += img_org[rowu1 + x];
-		if (x + 1 < xsize) {
-			img[row0 + x] += img_org[rowu1 + x + 1] * Average5x5_w;
-		}
-	}
 
-	img[row0 + x] *= Average5x5_scale;
-}
 
 void Butteraugli8x8CornerEdgeDetectorDiff(
     int pos_x,
@@ -3138,7 +3140,6 @@ __kernel void clComputeBlockZeroingOrder(
 {
     const int block_x = get_global_id(0);
     const int block_y = get_global_id(1);
-#define kComputeBlockSize (kBlockSize * 3)
 
     channel_info orig_channel[3];
     orig_channel[0].coeff = orig_batch_0;
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 67a7f2a0..3f5e46ff 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -49,7 +49,6 @@ ocl_args_d_t& getOcl(void)
 	ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionX", &err);
 	ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionY", &err);
 	ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "clSquareSample", &err);
-	ocl.kernel[KERNEL_DOWNSAMPLE] =   clCreateKernel(ocl.program, "clDownSample", &err);
 	ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "clOpsinDynamicsImage", &err);
 	ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "clDoMask", &err);
 	ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "clScaleImage", &err);
@@ -313,12 +312,12 @@ void clConvolutionEx(
 	cl_float clborder_ratio = border_ratio;
 
 	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&multipliers);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result);
-	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clxsize);
-	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&clxstep);
-	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cllen);
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result);
+    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp);
+    clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxsize);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&multipliers);
+    clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&cllen);
+	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&clxstep);
 	clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset);
 	clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio);
 
@@ -350,11 +349,11 @@ void clConvolutionX(
 	cl_float clborder_ratio = border_ratio;
 
 	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONX];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&multipliers);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result);
-	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&xstep);
-	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&cllen);
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result);
+    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&multipliers);
+    clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cllen);
+	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&xstep);
 	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cloffset);
 	clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio);
 
@@ -387,11 +386,11 @@ void clConvolutionY(
 	cl_float clborder_ratio = border_ratio;
 
 	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONY];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&multipliers);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result);
-	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&xstep);
-	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&cllen);
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result);
+    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&multipliers);
+    clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cllen);
+	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&xstep);
 	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cloffset);
 	clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio);
 
@@ -408,7 +407,7 @@ void clConvolutionY(
 	}
 }
 
-void clUpsampleEx2(
+void clSquareSampleEx(
     cl_mem result/*out*/,
     const cl_mem image, size_t xsize, size_t ysize,
 	size_t xstep, size_t ystep)
@@ -419,38 +418,8 @@ void clUpsampleEx2(
 	cl_int clxstep = xstep;
 	cl_int clystep = ystep;
 	cl_kernel kernel = ocl.kernel[KERNEL_SQUARESAMPLE];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&image);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&result);
-	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxstep);
-	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clystep);
-
-	size_t globalWorkSize[2] = { xsize, ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clUpsampleEx clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
-	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clUpsampleEx clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
-}
-
-void clUpsampleEx(
-    cl_mem result/*out*/,
-    const cl_mem image,
-    const size_t xsize, const size_t ysize,
-    const size_t xstep, const size_t ystep)
-{
-	cl_int err = CL_SUCCESS;
-	ocl_args_d_t &ocl = getOcl();
-
-	cl_int clxstep = xstep;
-	cl_int clystep = ystep;
-	cl_kernel kernel = ocl.kernel[KERNEL_DOWNSAMPLE];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&image);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&result);
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&image);
 	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxstep);
 	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clystep);
 
@@ -537,7 +506,7 @@ void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
 		ocl.allocA(sizeof(cl_float) * xsize * ysize);
 		clConvolutionX(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
 		clConvolutionY(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-		clUpsampleEx2(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep);
+        clSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep);
 	}
 	else
 	{
@@ -563,7 +532,6 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t
 	clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g);
 	clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
 
-	cl_int clSize = xsize * ysize;
 	cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE];
 	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&rgb.r);
 	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&rgb.g);
@@ -571,7 +539,6 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t
 	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&rgb_blurred.r);
 	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&rgb_blurred.g);
 	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&rgb_blurred.b);
-	clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&clSize);
 
 	size_t globalWorkSize[1] = { xsize * ysize };
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -710,14 +677,14 @@ void clBlockDiffMapEx(
 	cl_int clstep = step;
 
 	cl_kernel kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), &rgb.r);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), &rgb.g);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb.b);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb2.r);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb2.g);
-	clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2.b);
-	clSetKernelArg(kernel, 6, sizeof(cl_mem), &block_diff_dc);
-	clSetKernelArg(kernel, 7, sizeof(cl_mem), &block_diff_ac);
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), &block_diff_dc);
+    clSetKernelArg(kernel, 1, sizeof(cl_mem), &block_diff_ac);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb.r);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb.g);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb.b);
+	clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2.r);
+	clSetKernelArg(kernel, 6, sizeof(cl_mem), &rgb2.g);
+	clSetKernelArg(kernel, 7, sizeof(cl_mem), &rgb2.b);
 	clSetKernelArg(kernel, 8, sizeof(cl_int), &clxsize);
 	clSetKernelArg(kernel, 9, sizeof(cl_int), &clysize);
 	clSetKernelArg(kernel, 10, sizeof(cl_int), &clstep);
@@ -802,15 +769,15 @@ void clDiffPrecomputeEx(
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.x);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0.y);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0.b);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb1.x);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb1.y);
-	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb1.b);
-	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&mask.x);
-	clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&mask.y);
-	clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&mask.b);
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.x);
+    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.y);
+    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.b);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb0.x);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb0.y);
+	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb0.b);
+	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&xyb1.x);
+	clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&xyb1.y);
+	clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&xyb1.b);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -833,8 +800,8 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w)
 	cl_double clscale = w;
 
 	cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE];
-	clSetKernelArg(kernel, 0, sizeof(cl_double), (void*)&clscale);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img);
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img);
+	clSetKernelArg(kernel, 1, sizeof(cl_double), (void*)&clscale);
 
 	size_t globalWorkSize[1] = { size };
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -851,35 +818,35 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w)
 
 void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize)
 {
-	if (xsize < 4 || ysize < 4) {
-		// TODO: Make this work for small dimensions as well.
-		return;
-	}
+    if (xsize < 4 || ysize < 4) {
+	    // TODO: Make this work for small dimensions as well.
+	    return;
+    }
 
-	cl_int err = CL_SUCCESS;
-	ocl_args_d_t &ocl = getOcl();
+    cl_int err = CL_SUCCESS;
+    ocl_args_d_t &ocl = getOcl();
 
-	size_t len = xsize * ysize * sizeof(float);
-	ocl.allocA(len);
-	cl_mem tmp = ocl.srcA;
+    size_t len = xsize * ysize * sizeof(float);
+    ocl.allocA(len);
+    cl_mem img_org = ocl.srcA;
 
-	err = clEnqueueCopyBuffer(ocl.commandQueue, img, tmp, 0, 0, len, 0, NULL, NULL);
+    err = clEnqueueCopyBuffer(ocl.commandQueue, img, img_org, 0, 0, len, 0, NULL, NULL);
 
-  cl_kernel kernel = ocl.kernel[KERNEL_AVERAGE5X5];
-  clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img);
-  clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&tmp);
+    cl_kernel kernel = ocl.kernel[KERNEL_AVERAGE5X5];
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img);
+    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img_org);
 
-  size_t globalWorkSize[2] = { xsize, ysize };
-  err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-  if (CL_SUCCESS != err)
-  {
+    size_t globalWorkSize[2] = { xsize, ysize };
+    err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    if (CL_SUCCESS != err)
+    {
     LogError("Error: clAverage5x5Ex() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-  }
-  err = clFinish(ocl.commandQueue);
-  if (CL_SUCCESS != err)
-  {
+    }
+    err = clFinish(ocl.commandQueue);
+    if (CL_SUCCESS != err)
+    {
     LogError("Error: clAverage5x5Ex() clFinish returned %s.\n", TranslateOpenCLError(err));
-  }
+    }
 }
 
 void clMinSquareValEx(
@@ -895,8 +862,8 @@ void clMinSquareValEx(
 	ocl.allocA(sizeof(cl_float) * xsize * ysize);
 
 	cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&ocl.srcA);
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img);
 	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clsquare_size);
 	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cloffset);
 
@@ -1105,20 +1072,20 @@ void clCombineChannelsEx(
 	cl_int clstep = step;
 
 	cl_kernel kernel = ocl.kernel[KERNEL_COMBINECHANNELS];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.r);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.g);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.b);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask_dc.r);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mask_dc.g);
-	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask_dc.b);
-	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&block_diff_dc);
-	clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&block_diff_ac);
-	clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&edge_detector_map);
-	clSetKernelArg(kernel, 9, sizeof(cl_int), (void*)&clxsize);
-	clSetKernelArg(kernel, 10, sizeof(cl_int), (void*)&clysize);
-	clSetKernelArg(kernel, 11, sizeof(cl_int), (void*)&clres_size);
-	clSetKernelArg(kernel, 12, sizeof(cl_int), (void*)&clstep);
-	clSetKernelArg(kernel, 13, sizeof(cl_mem), (void*)&result);
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.r);
+	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.g);
+	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask.b);
+	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mask_dc.r);
+	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask_dc.g);
+	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&mask_dc.b);
+    clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&clxsize);
+    clSetKernelArg(kernel, 8, sizeof(cl_int), (void*)&clysize);
+	clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&block_diff_dc);
+	clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&block_diff_ac);
+	clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&edge_detector_map);
+	clSetKernelArg(kernel, 12, sizeof(cl_int), (void*)&clres_size);
+	clSetKernelArg(kernel, 13, sizeof(cl_int), (void*)&clstep);
 
 	size_t globalWorkSize[2] = { work_xsize, work_ysize };
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index f1aa6c22..de90c9b0 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -58,7 +58,7 @@ void clConvolutionY(
     const cl_mem multipliers, size_t len,
     int xstep, int offset, double border_ratio);
 
-void clUpsampleEx2(
+void clSquareSampleEx(
     cl_mem result/*out*/,
     const cl_mem image, size_t xsize, size_t ysize,
     size_t xstep, size_t ystep);
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 28ae9d1b..a19121c1 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -330,34 +330,6 @@ void tclConvolution(size_t xsize, size_t ysize,
 	clReleaseMemObject(m);
 }
 
-// chirsk todo
-void tclUpsample(float* image, size_t xsize, size_t ysize,
-	size_t xstep, size_t ystep,
-	float* result)
-{
-	int dxsize = (xsize + xstep - 1) / xstep;
-	int dysize = (ysize + ystep - 1) / ystep;
-	size_t img_size = dxsize * dysize * sizeof(float);
-	size_t result_size = xsize * ysize * sizeof(float);
-	cl_int err = 0;
-	ocl_args_d_t &ocl = getOcl();
-	cl_mem img = ocl.allocMem(img_size, image);
-	ocl.allocA(result_size);
-	cl_mem r = ocl.srcA;
-
-	clUpsampleEx(r, img, xsize, ysize, xstep, ystep);
-
-	cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, r, true, CL_MAP_READ, 0, result_size, 0, NULL, NULL, &err);
-	err = clFinish(ocl.commandQueue);
-
-	FLOAT_COMPARE(result, r_r, xsize * ysize);
-
-	clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
-
-	clReleaseMemObject(img);
-}
-
 // ian todo
 void tclDiffPrecompute(
   const const std::vector<std::vector<float> > &xyb0,
diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h
index a84b94ac..b27c7942 100644
--- a/clguetzli/clguetzli_test.h
+++ b/clguetzli/clguetzli_test.h
@@ -72,7 +72,3 @@ void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_
 void tclMinSquareVal(const float *img, size_t square_size, size_t offset,
 	size_t xsize, size_t ysize,
 	const float *result);
-
-void tclUpsample(const float* image, size_t xsize, size_t ysize,
-	size_t xstep, size_t ystep,
-	const float* result);
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index ed2f1ee2..802ded26 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -50,7 +50,6 @@ enum KernelName {
 	KERNEL_CONVOLUTIONX,
 	KERNEL_CONVOLUTIONY,
 	KERNEL_SQUARESAMPLE,
-	KERNEL_DOWNSAMPLE,
 	KERNEL_OPSINDYNAMICSIMAGE,
 	KERNEL_DOMASK,
 	KERNEL_SCALEIMAGE,

From f5fcd1bd458e6e105dd59fa1ffa78a2cd52fa9af Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 25 May 2017 13:37:50 +0800
Subject: [PATCH 112/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

---
 clguetzli/clguetzli.cpp      | 5 +++++
 clguetzli/clguetzli_test.cpp | 4 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 3f5e46ff..16d614d9 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -200,6 +200,9 @@ void clComputeBlockZeroingOrder(
     cl_int clFactor = factor;
     cl_int clMask = comp_mask;
 
+	clEnqueueWriteBuffer(ocl.commandQueue, mem_output_order_batch, CL_FALSE, 0, output_order_batch_size, output_order_batch, 0, NULL, NULL);
+	err = clFinish(ocl.commandQueue);
+
     cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER];
     clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_coeff[0]);
     clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_coeff[1]);
@@ -443,6 +446,7 @@ void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
     clBlurEx2(image, xsize, ysize, sigma, border_ratio, result);
 
     return;
+/*
     double m = 2.25;  // Accuracy increases when m is increased.
     const double scaler = -1.0 / (2 * sigma * sigma);
     // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52}
@@ -479,6 +483,7 @@ void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
     }
 
     clReleaseMemObject(mem_expn);
+*/
 }
 
 void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index a19121c1..bbfdb970 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -332,8 +332,8 @@ void tclConvolution(size_t xsize, size_t ysize,
 
 // ian todo
 void tclDiffPrecompute(
-  const const std::vector<std::vector<float> > &xyb0,
-  const const std::vector<std::vector<float> > &xyb1,
+  const std::vector<std::vector<float> > &xyb0,
+  const std::vector<std::vector<float> > &xyb1,
   size_t xsize, size_t ysize,
   const std::vector<std::vector<float> > *mask_cmp)
 {

From bb1e067909222e135e654eb12b7dbc9337a5b5db Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 25 May 2017 17:16:49 +0800
Subject: [PATCH 113/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E4=BB=A3=E7=A0=81?=
 =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E6=AD=A3=E5=8F=82=E6=95=B0=E4=BC=A0=E9=80=92?=
 =?UTF-8?q?=E8=A7=84=E5=88=99?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl  | 371 +++++++++++++++++++++-------------------
 clguetzli/clguetzli.cpp |  64 +++----
 clguetzli/clguetzli.h   |  10 +-
 clguetzli/ocl.h         |  24 +--
 4 files changed, 240 insertions(+), 229 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 9639c018..644a009a 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -9,6 +9,19 @@
 #define kBlockHalf      (kBlockEdge * kBlockEdgeHalf)
 #define kComputeBlockSize (kBlockSize * 3)
 
+// IntFloatPair��Ϊ��ģ��output_order input_order��vector
+typedef struct __IntFloatPair
+{
+    int   idx;
+    float err;
+}IntFloatPair, DCTScoreData, CoeffData;
+
+typedef struct __IntFloatPairList
+{
+    int size;
+    IntFloatPair *pData;
+}IntFloatPairList;
+
 void   XybToVals(double x, double y, double z, double *valx, double *valy, double *valz);
 double InterpolateClampNegative(__global const double *array, int size, double sx);
 void   XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0,
@@ -32,7 +45,25 @@ void Butteraugli8x8CornerEdgeDetectorDiff(
     __global const float *r2, __global const float* g2, __global const float *b2,
     double* diff_xyb);
 
-__kernel void clConvolution(
+int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order);
+
+double CompareBlockFactor(const channel_info mayout_channel[3],
+                        const coeff_t* candidate_block,
+                        const int block_x,
+                        const int block_y,
+                        __global const float *orig_image_batch,
+                        __global const float *mask_scale,
+                        const int image_width,
+                        const int image_height,
+                        const int factor);
+
+void floatcopy(float *dst, const float *src, int size);
+void coeffcopy(coeff_t *dst, const coeff_t *src, int size);
+void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size);
+int list_erase(IntFloatPairList* list, int idx);
+int list_push_back(IntFloatPairList* list, int i, float f);
+
+__kernel void clConvolutionEx(
 	__global float* result,
 	__global const float* inp, const int xsize,
 	__global const float* multipliers, const int len,
@@ -73,7 +104,7 @@ __kernel void clConvolution(
     result[ox * ysize + y] = sum * scale;
 }
 
-__kernel void clConvolutionX(
+__kernel void clConvolutionXEx(
 	__global float* result,
 	__global const float* inp,
 	__global const float* multipliers, const int len,
@@ -114,7 +145,7 @@ __kernel void clConvolutionX(
     result[y * xsize + x] = sum * scale;
 }
 
-__kernel void clConvolutionY(
+__kernel void clConvolutionYEx(
 	__global float* result,
 	__global const float* inp,
 	__global const float* multipliers, const int len,
@@ -156,7 +187,7 @@ __kernel void clConvolutionY(
     result[y * xsize + x] = sum * scale;
 }
 
-__kernel void clSquareSample(
+__kernel void clSquareSampleEx(
 	__global float* result,
 	__global const float* image,
 	const int xstep, const int ystep)
@@ -175,7 +206,7 @@ __kernel void clSquareSample(
     result[y * xsize + x] = image[y_sample * xsize + x_sample];
 }
 
-__kernel void clOpsinDynamicsImage(
+__kernel void clOpsinDynamicsImageEx(
     __global float *r, __global float *g, __global float *b,
     __global const float *r_blurred, __global const float *g_blurred, __global const float *b_blurred)
 {
@@ -203,7 +234,7 @@ __kernel void clOpsinDynamicsImage(
     b[i] = z;
 }
 
-__kernel void clMaskHighIntensityChange(
+__kernel void clMaskHighIntensityChangeEx(
     __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b,
     __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b,
     __global const float *c0_x, __global const float *c0_y, __global const float *c0_b,
@@ -260,7 +291,7 @@ __kernel void clMaskHighIntensityChange(
     xyb1_b[ix] = (float)(mix[2] * c1_b[ix] + (1 - mix[2]) * ave[2]);
 }
 
-__kernel void clEdgeDetectorMap(
+__kernel void clEdgeDetectorMapEx(
 	__global float *result,
     __global const float *r, __global const float *g, __global const float* b,
     __global const float *r2, __global const float* g2, __global const float *b2,
@@ -294,7 +325,7 @@ __kernel void clEdgeDetectorMap(
 }
 
 
-__kernel void clBlockDiffMap(
+__kernel void clBlockDiffMapEx(
 	__global float* block_diff_dc, __global float* block_diff_ac,
 	__global const float* r, __global const float* g, __global const float* b,
     __global const float* r2, __global const float* g2, __global const float* b2,
@@ -352,7 +383,7 @@ __kernel void clBlockDiffMap(
     }
 }
 
-__kernel void clEdgeDetectorLowFreq(
+__kernel void clEdgeDetectorLowFreqEx(
 	__global float *block_diff_ac,
     __global const float *r, __global const float *g, __global const float* b,
     __global const float *r2, __global const float* g2, __global const float *b2,
@@ -416,7 +447,7 @@ __kernel void clEdgeDetectorLowFreq(
     block_diff_ac[res_ix * 3 + 2] += max_diff_xyb[2] * kMul;
 }
 
-__kernel void clDiffPrecompute(
+__kernel void clDiffPrecomputeEx(
     __global float *mask_x, __global float *mask_y, __global float *mask_b,
     __global const float *xyb0_x, __global const float *xyb0_y, __global const float *xyb0_b,
     __global const float *xyb1_x, __global const float *xyb1_y, __global const float *xyb1_b)
@@ -482,7 +513,7 @@ __kernel void clDiffPrecompute(
     mask_b[ix] = (float)(m);
 }
 
-__kernel void clScaleImage(__global float *img, double scale)
+__kernel void clScaleImageEx(__global float *img, double scale)
 {
     const int i = get_global_id(0);
     img[i] *= scale;
@@ -490,7 +521,7 @@ __kernel void clScaleImage(__global float *img, double scale)
 
 #define Average5x5_w 0.679144890667f
 __constant float Average5x5_scale = 1.0f / (5.0f + 4 * Average5x5_w);
-__kernel void clAverage5x5(__global float *img, __global const float *img_org)
+__kernel void clAverage5x5Ex(__global float *img, __global const float *img_org)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -530,7 +561,7 @@ __kernel void clAverage5x5(__global float *img, __global const float *img_org)
 	img[row0 + x] *= Average5x5_scale;
 }
 
-__kernel void clMinSquareVal(__global float* result, __global const float* img,  int square_size, int offset)
+__kernel void clMinSquareValEx(__global float* result, __global const float* img,  int square_size, int offset)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -557,7 +588,7 @@ __kernel void clMinSquareVal(__global float* result, __global const float* img,
     result[y * width + x] = minValue;
 }
 
-__kernel void clDoMask(
+__kernel void clDoMaskEx(
     __global float *mask_x, __global float *mask_y, __global float *mask_b,
     __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b,
     __global const double *lut_x, __global const double *lut_y, __global const double *lut_b,
@@ -590,7 +621,7 @@ __kernel void clDoMask(
 
 }
 
-__kernel void clCombineChannels(
+__kernel void clCombineChannelsEx(
     __global float *result,
     __global const float *mask_x, __global const float *mask_y, __global const float *mask_b,
     __global const float *mask_dc_x, __global const float *mask_dc_y, __global const float *mask_dc_b,
@@ -622,7 +653,40 @@ __kernel void clCombineChannels(
         DotProduct(&edge_detector_map[3 * res_ix], mask));
 }
 
-__kernel void clRemoveBorder(__global const float *in, int in_xsize, int s, int s2, __global float *out)
+__kernel void clUpsampleSquareRootEx(__global float *diffmap_out, __global const float *diffmap, int xsize, int ysize, int step)
+{
+    const int res_x = get_global_id(0);
+    const int res_y = get_global_id(1);
+
+    const int res_xsize = get_global_size(0);
+    const int res_ysize = get_global_size(1);
+
+    const int pos_x = res_x * step;
+    const int pos_y = res_y * step;
+
+    if (pos_y + 8 - step >= ysize) return;
+    if (pos_x + 8 - step >= xsize) return;
+
+    int s2 = (8 - step) / 2;
+
+    // Upsample and take square root.
+    float orig_val = diffmap[res_y * res_xsize + res_x];
+
+    const float kInitialSlope = 100;
+    // TODO(b/29974893): Until that is fixed do not call sqrt on very small
+    // numbers.
+    double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope))
+        ? kInitialSlope * orig_val
+        : sqrt(orig_val);
+
+    for (size_t off_y = 0; off_y < step; ++off_y) {
+        for (size_t off_x = 0; off_x < step; ++off_x) {
+            diffmap_out[(pos_y + off_y + s2) * xsize + pos_x + off_x + s2] = val;
+        }
+    }
+}
+
+__kernel void clRemoveBorderEx(__global float *out, __global const float *in, int in_xsize, int s, int s2)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -633,7 +697,7 @@ __kernel void clRemoveBorder(__global const float *in, int in_xsize, int s, int
     out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2];
 }
 
-__kernel void clAddBorder(__global float *out, int s, int s2, __global const float *in)
+__kernel void clAddBorderEx(__global float *out, int s, int s2, __global const float *in)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
@@ -651,40 +715,131 @@ __kernel void clAddBorder(__global float *out, int s, int s2, __global const flo
 
 }
 
-__kernel void clUpsampleSquareRoot(__global const float *diffmap, int xsize, int ysize, int step, __global float *diffmap_out)
+// batch��ָ�Ѿ���ά��չ��Ϊ��һά��
+__kernel void clComputeBlockZeroingOrderEx(
+    __global const coeff_t *orig_batch_0,       // ԭʼͼ��ϵ��
+    __global const coeff_t *orig_batch_1,       // ԭʼͼ��ϵ��
+    __global const coeff_t *orig_batch_2,       // ԭʼͼ��ϵ��
+    __global const float   *orig_image_batch,   // ԭʼͼ��pregamma
+    __global const float   *mask_scale,         // ԭʼͼ���ĳ�����ز���
+    const int              image_width,
+    const int              image_height,
+
+    __global const coeff_t *mayout_batch_0,     // �����ѡͼ��ϵ��
+    __global const coeff_t *mayout_batch_1,     // �����ѡͼ��ϵ��
+    __global const coeff_t *mayout_batch_2,     // �����ѡͼ��ϵ��
+    __global const ushort  *mayout_pixel_0,
+    __global const ushort  *mayout_pixel_1,
+    __global const ushort  *mayout_pixel_2,
+
+    const channel_info     mayout_channel_0,
+    const channel_info     mayout_channel_1,
+    const channel_info     mayout_channel_2,
+    const int factor,                                 // ��ǰ���������factor
+    const int comp_mask,                              // ��ǰ���������channel
+    const float BlockErrorLimit,
+    __global CoeffData *output_order_list/*out*/)
 {
-    const int res_x = get_global_id(0);
-    const int res_y = get_global_id(1);
+    const int block_x = get_global_id(0);
+    const int block_y = get_global_id(1);
 
-    const int res_xsize = get_global_size(0);
-    const int res_ysize = get_global_size(1);
+    channel_info orig_channel[3];
+    orig_channel[0].coeff = orig_batch_0;
+    orig_channel[1].coeff = orig_batch_1;
+    orig_channel[2].coeff = orig_batch_2;
 
-    const int pos_x = res_x * step;
-    const int pos_y = res_y * step;
+    channel_info mayout_channel[3] = { mayout_channel_0, mayout_channel_1, mayout_channel_2 };
+    mayout_channel[0].coeff = mayout_batch_0;
+    mayout_channel[1].coeff = mayout_batch_1;
+    mayout_channel[2].coeff = mayout_batch_2;
+    mayout_channel[0].pixel = mayout_pixel_0;
+    mayout_channel[1].pixel = mayout_pixel_1;
+    mayout_channel[2].pixel = mayout_pixel_2;
 
-    if (pos_y + 8 - step >= ysize) return;
-    if (pos_x + 8 - step >= xsize) return;
+    int block_idx = 0;        // ��������mask���е�channel������indx
 
-    int s2 = (8 - step) / 2;
+    coeff_t mayout_block[kComputeBlockSize] = { 0 };
+    coeff_t orig_block[kComputeBlockSize]   = { 0 };
 
-    // Upsample and take square root.
-    float orig_val = diffmap[res_y * res_xsize + res_x];
+    for (int c = 0; c < 3; c++) {
+        if (comp_mask & (1<<c)) {
+            block_idx = block_y * mayout_channel[c].block_width + block_x;
+            coeffcopy_g(&mayout_block[c * kBlockSize],
+                mayout_channel[c].coeff + block_idx * kBlockSize,
+                kBlockSize);
+            coeffcopy_g(&orig_block[c * kBlockSize],
+                orig_channel[c].coeff + block_idx * kBlockSize,
+                kBlockSize);
+        }
+    }
 
-    const float kInitialSlope = 100;
-    // TODO(b/29974893): Until that is fixed do not call sqrt on very small
-    // numbers.
-    double val = orig_val < (1.0 / (kInitialSlope * kInitialSlope))
-        ? kInitialSlope * orig_val
-        : sqrt(orig_val);
+    DCTScoreData input_order_data[kComputeBlockSize];
+    CoeffData    output_order_data[kComputeBlockSize];
 
-    for (size_t off_y = 0; off_y < step; ++off_y) {
-        for (size_t off_x = 0; off_x < step; ++off_x) {
-            diffmap_out[(pos_y + off_y + s2) * xsize + pos_x + off_x + s2] = val;
+    IntFloatPairList input_order = { 0, input_order_data };
+    IntFloatPairList output_order = { 0, output_order_data };
+
+    int count = MakeInputOrderEx(mayout_block, orig_block, &input_order);
+
+    coeff_t processed_block[kComputeBlockSize];
+    coeffcopy(processed_block, mayout_block, kComputeBlockSize);
+
+    while (input_order.size > 0)
+    {
+        float best_err = 1e17f;
+        int best_i = 0;
+        for (int i = 0; i < min(3, input_order.size); i++)
+        {
+            coeff_t candidate_block[kComputeBlockSize];
+            coeffcopy(candidate_block, processed_block, kComputeBlockSize);
+
+            const int idx = input_order.pData[i].idx;
+            candidate_block[idx] = 0;
+
+            float max_err = CompareBlockFactor(mayout_channel,
+                                               candidate_block,
+                                               block_x,
+                                               block_y,
+                                               orig_image_batch,
+                                               mask_scale,
+                                               image_width,
+                                               image_height,
+                                               factor);
+            if (max_err < best_err)
+            {
+                best_err = max_err;
+                best_i = i;
+            }
         }
+
+        int idx = input_order.pData[best_i].idx;
+        processed_block[idx] = 0;
+        list_erase(&input_order, best_i);
+
+        list_push_back(&output_order, idx, best_err);
+    }
+
+    // ע��output_order�����resize���ǰ�β������λ0
+    float min_err = 1e10;
+    for (int i = output_order.size - 1; i >= 0; --i) {
+        min_err = min(min_err, output_order.pData[i].err);
+        output_order.pData[i].err = min_err;
     }
-}
 
+    __global CoeffData *output_block = output_order_list + block_idx * kComputeBlockSize;
 
+    int out_count = 0;
+    for (int i = 0; i < kComputeBlockSize && i < output_order.size; i++)
+    {
+        // ���˽ϴ��err���ⲿ�ֽ����˼���û������
+        if (output_order.pData[i].err <= BlockErrorLimit)
+        {
+            output_block[out_count].idx = output_order.pData[i].idx;
+            output_block[out_count].err = output_order.pData[i].err;
+            out_count++;
+        }
+    }
+}
 
 void Butteraugli8x8CornerEdgeDetectorDiff(
     int pos_x,
@@ -1394,19 +1549,6 @@ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *
     *valz = b;
 }
 
-// IntFloatPair��Ϊ��ģ��output_order input_order��vector�����Ǵ�С�̶�Ϊ8x8
-typedef struct __IntFloatPair
-{
-    int   idx;
-    float err;
-}IntFloatPair, DCTScoreData, CoeffData;
-
-typedef struct __IntFloatPairList
-{
-    int size;
-    IntFloatPair *pData;
-}IntFloatPairList;
-
 // chrisk todo
 // return size
 int list_push_back(IntFloatPairList* list, int i, float f)
@@ -3113,128 +3255,3 @@ double CompareBlockFactor(const channel_info mayout_channel[3],
     }
 }
 
-// batch��ָ�Ѿ���ά��չ��Ϊ��һά��
-__kernel void clComputeBlockZeroingOrder(
-    __global const coeff_t *orig_batch_0,       // ԭʼͼ��ϵ��
-    __global const coeff_t *orig_batch_1,       // ԭʼͼ��ϵ��
-    __global const coeff_t *orig_batch_2,       // ԭʼͼ��ϵ��
-    __global const float   *orig_image_batch,   // ԭʼͼ��pregamma
-    __global const float   *mask_scale,         // ԭʼͼ���ĳ�����ز���
-    const int              image_width,
-    const int              image_height,
-
-    __global const coeff_t *mayout_batch_0,     // �����ѡͼ��ϵ��
-    __global const coeff_t *mayout_batch_1,     // �����ѡͼ��ϵ��
-    __global const coeff_t *mayout_batch_2,     // �����ѡͼ��ϵ��
-    __global const ushort  *mayout_pixel_0,
-    __global const ushort  *mayout_pixel_1,
-    __global const ushort  *mayout_pixel_2,
-
-    const channel_info     mayout_channel_0,
-    const channel_info     mayout_channel_1,
-    const channel_info     mayout_channel_2,
-    const int factor,                                 // ��ǰ���������factor
-    const int comp_mask,                              // ��ǰ���������channel
-    const float BlockErrorLimit,
-    __global CoeffData *output_order_list/*out*/)
-{
-    const int block_x = get_global_id(0);
-    const int block_y = get_global_id(1);
-
-    channel_info orig_channel[3];
-    orig_channel[0].coeff = orig_batch_0;
-    orig_channel[1].coeff = orig_batch_1;
-    orig_channel[2].coeff = orig_batch_2;
-
-    channel_info mayout_channel[3] = { mayout_channel_0, mayout_channel_1, mayout_channel_2 };
-    mayout_channel[0].coeff = mayout_batch_0;
-    mayout_channel[1].coeff = mayout_batch_1;
-    mayout_channel[2].coeff = mayout_batch_2;
-    mayout_channel[0].pixel = mayout_pixel_0;
-    mayout_channel[1].pixel = mayout_pixel_1;
-    mayout_channel[2].pixel = mayout_pixel_2;
-
-    int block_idx = 0;        // ��������mask���е�channel������indx
-
-    coeff_t mayout_block[kComputeBlockSize] = { 0 };
-    coeff_t orig_block[kComputeBlockSize]   = { 0 };
-
-    for (int c = 0; c < 3; c++) {
-        if (comp_mask & (1<<c)) {
-            block_idx = block_y * mayout_channel[c].block_width + block_x;
-            coeffcopy_g(&mayout_block[c * kBlockSize],
-                mayout_channel[c].coeff + block_idx * kBlockSize,
-                kBlockSize);
-            coeffcopy_g(&orig_block[c * kBlockSize],
-                orig_channel[c].coeff + block_idx * kBlockSize,
-                kBlockSize);
-        }
-    }
-
-    DCTScoreData input_order_data[kComputeBlockSize];
-    CoeffData    output_order_data[kComputeBlockSize];
-
-    IntFloatPairList input_order = { 0, input_order_data };
-    IntFloatPairList output_order = { 0, output_order_data };
-
-    int count = MakeInputOrderEx(mayout_block, orig_block, &input_order);
-
-    coeff_t processed_block[kComputeBlockSize];
-    coeffcopy(processed_block, mayout_block, kComputeBlockSize);
-
-    while (input_order.size > 0)
-    {
-        float best_err = 1e17f;
-        int best_i = 0;
-        for (int i = 0; i < min(3, input_order.size); i++)
-        {
-            coeff_t candidate_block[kComputeBlockSize];
-            coeffcopy(candidate_block, processed_block, kComputeBlockSize);
-
-            const int idx = input_order.pData[i].idx;
-            candidate_block[idx] = 0;
-
-            float max_err = CompareBlockFactor(mayout_channel,
-                                               candidate_block,
-                                               block_x,
-                                               block_y,
-                                               orig_image_batch,
-                                               mask_scale,
-                                               image_width,
-                                               image_height,
-                                               factor);
-            if (max_err < best_err)
-            {
-                best_err = max_err;
-                best_i = i;
-            }
-        }
-
-        int idx = input_order.pData[best_i].idx;
-        processed_block[idx] = 0;
-        list_erase(&input_order, best_i);
-
-        list_push_back(&output_order, idx, best_err);
-    }
-
-    // ע��output_order�����resize���ǰ�β������λ0
-    float min_err = 1e10;
-    for (int i = output_order.size - 1; i >= 0; --i) {
-        min_err = min(min_err, output_order.pData[i].err);
-        output_order.pData[i].err = min_err;
-    }
-
-    __global CoeffData *output_block = output_order_list + block_idx * kComputeBlockSize;
-
-    int out_count = 0;
-    for (int i = 0; i < kComputeBlockSize && i < output_order.size; i++)
-    {
-        // ���˽ϴ��err���ⲿ�ֽ����˼���û������
-        if (output_order.pData[i].err <= BlockErrorLimit)
-        {
-            output_block[out_count].idx = output_order.pData[i].idx;
-            output_block[out_count].err = output_order.pData[i].err;
-            out_count++;
-        }
-    }
-}
\ No newline at end of file
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 16d614d9..b58f7dc8 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -44,25 +44,25 @@ ocl_args_d_t& getOcl(void)
             LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]);
         }
 	}
-	ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "clMinSquareVal", &err);
-	ocl.kernel[KERNEL_CONVOLUTION] =  clCreateKernel(ocl.program, "clConvolution", &err);
-	ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionX", &err);
-	ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionY", &err);
-	ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "clSquareSample", &err);
-	ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "clOpsinDynamicsImage", &err);
-	ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "clDoMask", &err);
-	ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "clScaleImage", &err);
-	ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "clCombineChannels", &err);
-	ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "clMaskHighIntensityChange", &err);
-	ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "clDiffPrecompute", &err);
-	ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "clUpsampleSquareRoot", &err);
-	ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "clAddBorder", &err);
-	ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "clRemoveBorder", &err);
-	ocl.kernel[KERNEL_AVERAGE5X5] = clCreateKernel(ocl.program, "clAverage5x5", &err);
-	ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMap", &err);
-	ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMap", &err);
-	ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreq", &err);
-    ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrder", &err);
+    ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "clConvolutionEx", &err);
+    ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionXEx", &err);
+    ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionYEx", &err);
+    ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "clSquareSampleEx", &err);
+	ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "clOpsinDynamicsImageEx", &err);
+    ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "clMaskHighIntensityChangeEx", &err);
+    ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMapEx", &err);
+    ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMapEx", &err);
+    ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreqEx", &err);
+    ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "clDiffPrecomputeEx", &err);
+    ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "clScaleImageEx", &err);
+    ocl.kernel[KERNEL_AVERAGE5X5] = clCreateKernel(ocl.program, "clAverage5x5Ex", &err);
+    ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "clMinSquareValEx", &err);
+    ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "clDoMaskEx", &err);
+    ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "clCombineChannelsEx", &err);
+    ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "clUpsampleSquareRootEx", &err);
+    ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "clRemoveBorderEx", &err);
+    ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "clAddBorderEx", &err);
+    ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrderEx", &err);
 
 	return ocl;
 }
@@ -1114,14 +1114,14 @@ void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysi
 	cl_int clysize = ysize;
 	cl_int clstep = step;
 
-    cl_mem mem_diffmap = ocl.allocMem(xsize * ysize * sizeof(float));
+    cl_mem diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float));
 
 	cl_kernel kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&diffmap);
-	clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&xsize);
-	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&ysize);
-	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&step);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mem_diffmap);
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&diffmap_out);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&diffmap);
+	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&xsize);
+	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&ysize);
+	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&step);
 
 	const size_t res_xsize = (xsize + step - 1) / step;
 	const size_t res_ysize = (ysize + step - 1) / step;
@@ -1133,7 +1133,7 @@ void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysi
 		LogError("Error: clUpsampleSquareRootEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
 	}
 	err = clFinish(ocl.commandQueue);
-	err = clEnqueueCopyBuffer(ocl.commandQueue, mem_diffmap, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL);
+	err = clEnqueueCopyBuffer(ocl.commandQueue, diffmap_out, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
 		LogError("Error: clUpsampleSquareRootEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err));
@@ -1144,7 +1144,7 @@ void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysi
 		LogError("Error: clUpsampleSquareRootEx() clFinish returned %s.\n", TranslateOpenCLError(err));
 	}
 
-    clReleaseMemObject(mem_diffmap);
+    clReleaseMemObject(diffmap_out);
 }
 
 void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const size_t ysize, const int step)
@@ -1156,11 +1156,11 @@ void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const siz
 	cl_int cls2 = (8 - step) / 2;
     cl_int clxsize = xsize;
 	cl_kernel kernel = ocl.kernel[KERNEL_REMOVEBORDER];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), &in);
-    clSetKernelArg(kernel, 1, sizeof(cl_int), &clxsize);
-	clSetKernelArg(kernel, 2, sizeof(cl_int), &cls);
-	clSetKernelArg(kernel, 3, sizeof(cl_int), &cls2);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), &out);
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), &out);
+	clSetKernelArg(kernel, 1, sizeof(cl_mem), &in);
+    clSetKernelArg(kernel, 2, sizeof(cl_int), &clxsize);
+	clSetKernelArg(kernel, 3, sizeof(cl_int), &cls);
+	clSetKernelArg(kernel, 4, sizeof(cl_int), &cls2);
 
 	size_t globalWorkSize[2] = { xsize - cls, ysize - cls};
 	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index de90c9b0..b5997fcd 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -46,13 +46,13 @@ void clConvolutionEx(
     const cl_mem multipliers, size_t len,
     int xstep, int offset, double border_ratio);
 
-void clConvolutionX(
+void clConvolutionXEx(
     cl_mem result/*out*/,
     const cl_mem inp, size_t xsize, size_t ysize,
     const cl_mem multipliers, size_t len,
     int xstep, int offset, double border_ratio);
 
-void clConvolutionY(
+void clConvolutionYEx(
     cl_mem result/*out*/,
     const cl_mem inp, size_t xsize, size_t ysize,
     const cl_mem multipliers, size_t len,
@@ -63,12 +63,6 @@ void clSquareSampleEx(
     const cl_mem image, size_t xsize, size_t ysize,
     size_t xstep, size_t ystep);
 
-void clUpsampleEx(
-    cl_mem result/*out*/,
-    const cl_mem image,
-    const size_t xsize, const size_t ysize,
-    const size_t xstep, const size_t ystep);
-
 void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
     const double sigma, const double border_ratio,
     cl_mem result = nullptr/*out, opt*/);
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 802ded26..04407f5c 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -45,25 +45,25 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType);
 */
 
 enum KernelName {
-	KERNEL_MINSQUAREVAL = 0,
-	KERNEL_CONVOLUTION,
+	KERNEL_CONVOLUTION = 0,
 	KERNEL_CONVOLUTIONX,
 	KERNEL_CONVOLUTIONY,
 	KERNEL_SQUARESAMPLE,
 	KERNEL_OPSINDYNAMICSIMAGE,
-	KERNEL_DOMASK,
-	KERNEL_SCALEIMAGE,
+    KERNEL_MASKHIGHINTENSITYCHANGE,
+    KERNEL_EDGEDETECTOR,
+    KERNEL_BLOCKDIFFMAP,
+    KERNEL_EDGEDETECTORLOWFREQ,
+    KERNEL_DIFFPRECOMPUTE,
+    KERNEL_SCALEIMAGE,
+    KERNEL_AVERAGE5X5,
+    KERNEL_MINSQUAREVAL,
+    KERNEL_DOMASK,
 	KERNEL_COMBINECHANNELS,
-	KERNEL_MASKHIGHINTENSITYCHANGE,
-	KERNEL_DIFFPRECOMPUTE,
 	KERNEL_UPSAMPLESQUAREROOT,
+    KERNEL_REMOVEBORDER,
 	KERNEL_ADDBORDER,
-	KERNEL_REMOVEBORDER,
-    KERNEL_AVERAGE5X5,
-	KERNEL_EDGEDETECTOR,
-	KERNEL_BLOCKDIFFMAP,
-	KERNEL_EDGEDETECTORLOWFREQ,
-    KERNEL_COMPUTEBLOCKZEROINGORDER,
+	KERNEL_COMPUTEBLOCKZEROINGORDER,
 	KERNEL_COUNT,
 };
 

From 34af91ddd9028f258799e580e2e40d63764c7c58 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 31 May 2017 15:10:07 +0800
Subject: [PATCH 114/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

Conflicts:
	clguetzli/clguetzli.cl.cpp
---
 clguetzli/clguetzli.cl     | 17 +++++++++--------
 clguetzli/clguetzli.cl.cpp |  5 +++--
 clguetzli/ocl.cpp          | 21 +++++++++++++++++++--
 guetzli/processor.cc       |  2 +-
 4 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 644a009a..cf7bca3e 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -387,23 +387,24 @@ __kernel void clEdgeDetectorLowFreqEx(
 	__global float *block_diff_ac,
     __global const float *r, __global const float *g, __global const float* b,
     __global const float *r2, __global const float* g2, __global const float *b2,
-    int xsize, int ysize, int step)
+    int xsize, int ysize, int step_)
 {
     const int res_x = get_global_id(0);
     const int res_y = get_global_id(1);
 
-    if (res_x < 8 / step) return;
+	const int step = 8;
+    if (res_x < step / step_) return;
 
     const int res_xsize = get_global_size(0);
     const int res_ysize = get_global_size(1);
 
-    int pos_x = (res_x - (8 / step)) * step;
-    int pos_y = res_y * step;
+    int x = (res_x - (step / step_)) * step_;
+    int y = res_y * step_;
 
-    if (pos_x + 8 >= xsize) return;
-    if (pos_y + 8 >= ysize) return;
+    if (x + step >= xsize) return;
+    if (y + step >= ysize) return;
 
-    int ix = pos_y * xsize + pos_x;
+    int ix = y * xsize + x;
 
     double diff[4][3];
     __global const float* blurred0[3] = { r, g, b };
@@ -423,7 +424,7 @@ __kernel void clEdgeDetectorLowFreqEx(
             ((blurred1[i][ix] - blurred0[i][ix]) +
             (blurred0[i][ix2] - blurred1[i][ix2]));
         ix2 = ix + 6 * xsize - 6;
-        diff[3][i] = pos_x < 8 ? 0 :
+        diff[3][i] = x < step ? 0 :
             ((blurred1[i][ix] - blurred0[i][ix]) +
             (blurred0[i][ix2] - blurred1[i][ix2]));
     }
diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp
index 0a05b038..b3203fe9 100644
--- a/clguetzli/clguetzli.cl.cpp
+++ b/clguetzli/clguetzli.cl.cpp
@@ -96,6 +96,7 @@ namespace guetzli
     double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const
     {
         double err = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask);
+/*
         if (g_checkOpenCL)
         {
             channel_info mayout_channel[3];
@@ -118,12 +119,12 @@ namespace guetzli
                 height_,
                 factor_x_);
 
-            if (err != err2)
+            if (fabs(err - err2) > 0.001)
             {
                 LogError("CompareBlock miss %s(%d) \r\n", __FUNCTION__, __LINE__);
             }
         }
-
+*/
         return err;
     }
 }
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index 594adeec..73a8d022 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -203,7 +203,10 @@ cl_mem ocl_args_d_t::allocMem(size_t s, const void *init)
 	{
 		LogError("Error: allocMem() for buffer returned %s.\n", TranslateOpenCLError(err));
 	}
-    if (mem && init)
+    if (!mem) return NULL;
+
+    // init memory
+    if (init)
     {
         err = clEnqueueWriteBuffer(this->commandQueue, mem, CL_FALSE, 0, s, init, 0, NULL, NULL);
         if (CL_SUCCESS != err)
@@ -213,7 +216,21 @@ cl_mem ocl_args_d_t::allocMem(size_t s, const void *init)
         err = clFinish(this->commandQueue);
         if (CL_SUCCESS != err)
         {
-            LogError("Error: allocMem() clFinish return %s.\n", TranslateOpenCLError(err));
+            LogError("Error: allocMem() clEnqueueWriteBuffer/clFinish return %s.\n", TranslateOpenCLError(err));
+        }
+    }
+    else
+    {
+        cl_char cc = 0;
+        err = clEnqueueFillBuffer(this->commandQueue, mem, &cc, sizeof(cc), 0, s / sizeof(cc), 0, NULL, NULL);
+        if (CL_SUCCESS != err)
+        {
+            LogError("Error: allocMem() clEnqueueFillBuffer return %s.\n", TranslateOpenCLError(err));
+        }
+        err = clFinish(this->commandQueue);
+        if (CL_SUCCESS != err)
+        {
+            LogError("Error: allocMem() clEnqueueFillBuffer/clFinish return %s.\n", TranslateOpenCLError(err));
         }
     }
 
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 2c9811a9..1666d4fa 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -649,7 +649,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
         }
         if (count > 0)
         {
-            LogError("CHK %s(%d) %d:%d\r\n", __FUNCTION__, __LINE__, count, check_size);
+            LogError("CHK %s(%d) %d:%d\r\n", "SelectFrequencyMasking", __LINE__, count, check_size);
         }
     }
 

From b47cb8ddc9c6ba699fd04e6b0c2e1ff0ca8bff61 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 31 May 2017 19:18:17 +0800
Subject: [PATCH 115/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0CUDA=E7=BC=96?=
 =?UTF-8?q?=E8=AF=91=EF=BC=8C=E8=AF=B7=E5=B0=8F=E5=BF=83=E6=9B=B4=E6=96=B0?=
 =?UTF-8?q?=EF=BC=8C=E6=B2=A1=E5=AE=89=E8=A3=85cuda=E4=BC=9A=E6=97=A0?=
 =?UTF-8?q?=E6=B3=95=E7=BC=96=E8=AF=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cu       |   5 ++
 clguetzli/clguetzli_test.cpp |  26 ++++--
 clguetzli/ocu.cpp            |  79 +++++++++++++++++
 clguetzli/ocu.h              |  19 +++++
 compile.bat                  | 160 +++++++++++++++++++++++++++++++++++
 guetzli.vcxproj              |  74 +++++++++-------
 guetzli.vcxproj.filters      |   9 ++
 7 files changed, 334 insertions(+), 38 deletions(-)
 create mode 100644 clguetzli/clguetzli.cu
 create mode 100644 clguetzli/ocu.cpp
 create mode 100644 clguetzli/ocu.h
 create mode 100644 compile.bat

diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu
new file mode 100644
index 00000000..b76a81e7
--- /dev/null
+++ b/clguetzli/clguetzli.cu
@@ -0,0 +1,5 @@
+extern "C" __global__ void clScaleImageEx(float *img, double scale) // extern "C": keep the symbol unmangled so cuModuleGetFunction("clScaleImageEx") finds it
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x; // global element index; still equals blockIdx.x for 1-thread blocks
+    img[i] *= scale; // no length parameter, so no bounds check: the launch must cover exactly the buffer
+}
\ No newline at end of file
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index bbfdb970..e98e6369 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -5,6 +5,7 @@
 #include "clguetzli_test.h"
 #include "clguetzli.h"
 #include "ocl.h"
+#include "ocu.h"
 
 #define FLOAT_COMPARE(a, b, c)  floatCompare((a), (b), (c), __FUNCTION__, __LINE__ )
 
@@ -404,19 +405,26 @@ void tclMinSquareVal(const float *img, size_t square_size, size_t offset,
 
 void tclScaleImage(double scale, const float *result_org, const float *result_cmp, size_t length)
 {
-  cl_int err = 0;
-  ocl_args_d_t &ocl = getOcl();
-  cl_mem mem_result_org = ocl.allocMem(length * sizeof(float), result_org);
+/*
+    ocu_args_d_t &ocu = getOcu();
+    CUdeviceptr m = ocu.allocMem(length * sizeof(float), result_org);
+    cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE],
+    cuMemFree(m);
+    return;
+*/
+    cl_int err = 0;
+    ocl_args_d_t &ocl = getOcl();
+    cl_mem mem_result_org = ocl.allocMem(length * sizeof(float), result_org);
 
-  clScaleImageEx(mem_result_org, length, scale);
+    clScaleImageEx(mem_result_org, length, scale);
 
-  cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result_org, true, CL_MAP_READ, 0, length * sizeof(float), 0, NULL, NULL, &err);
-  err = clFinish(ocl.commandQueue);
+    cl_float *r_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result_org, true, CL_MAP_READ, 0, length * sizeof(float), 0, NULL, NULL, &err);
+    err = clFinish(ocl.commandQueue);
 
-  FLOAT_COMPARE(r_r, result_cmp, length);
+    FLOAT_COMPARE(r_r, result_cmp, length);
 
-  clEnqueueUnmapMemObject(ocl.commandQueue, mem_result_org, r_r, 0, NULL, NULL);
-  clReleaseMemObject(mem_result_org);
+    clEnqueueUnmapMemObject(ocl.commandQueue, mem_result_org, r_r, 0, NULL, NULL);
+    clReleaseMemObject(mem_result_org);
 }
 
 // strong todo
diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
new file mode 100644
index 00000000..5b5da9b7
--- /dev/null
+++ b/clguetzli/ocu.cpp
@@ -0,0 +1,79 @@
+
+#include <cuda.h>
+#include "ocu.h"
+
+ocu_args_d_t& getOcu(void)
+{
+    static bool bInit = false;
+    static ocu_args_d_t ocu;
+    // One-time lazy setup of the CUDA driver state; later calls return the same object.
+    if (bInit == true) return ocu;
+    bInit = true; // was never set, so every call re-created the context, module and stream
+    cuInit(0);
+
+    CUresult r = CUDA_SUCCESS;
+    CUcontext ctxt;
+    CUdevice dev = 0;
+
+    cuCtxCreate(&ctxt, CU_CTX_SCHED_BLOCKING_SYNC, dev);
+    // Log the adapter name, compute capability and multiprocessor / thread counts.
+    char name[1024] = { 0 }; // zeroed so the log line stays printable if the name query fails
+    int proc_count = 0;
+    int thread_count = 0;
+    int cap_major = 0, cap_minor = 0;
+    cuDeviceGetName(name, sizeof(name), dev);
+    cuDeviceGetAttribute(&cap_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
+    cuDeviceGetAttribute(&cap_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
+    cuDeviceGetAttribute(&proc_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+    cuDeviceGetAttribute(&thread_count, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
+    LogError("CUDA Adapter:%s Ver%d.%d (%d x %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count);
+
+    CUmodule mod = nullptr;
+    // Read the PTX text (path is relative to the current working directory).
+    char* source = nullptr;
+    size_t src_size = 0;
+    ReadSourceFromFile("clguetzli/clguetzli.cu.ptx30", &source, &src_size);
+    // NOTE(review): a PTX image must be NUL-terminated - confirm ReadSourceFromFile guarantees that.
+    CUjit_option jit_options[2];
+    void *jit_optvals[2];
+    jit_options[0] = CU_JIT_CACHE_MODE;
+    jit_optvals[0] = (void*)(uintptr_t)CU_JIT_CACHE_OPTION_CA;
+    r = source ? cuModuleLoadDataEx(&mod, source, 1, jit_options, jit_optvals) : CUDA_ERROR_FILE_NOT_FOUND;
+    if (CUDA_SUCCESS != r) LogError("Error: getOcu() loading clguetzli.cu.ptx30 returned %d.\r\n", (int)r);
+    delete[] source;
+    // Only look the kernel up when the module really loaded; 'mod' is not a valid handle otherwise.
+    if (CUDA_SUCCESS == r) cuModuleGetFunction(&ocu.kernel[KERNEL_SCALEIMAGE], mod, "clScaleImageEx");
+
+    cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED);
+    cuCtxSetSharedMemConfig(CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE);
+
+    cuStreamCreate(&ocu.stream, 0);
+
+    return ocu;
+}
+
+ocu_args_d_t::ocu_args_d_t()
+{
+    // Empty: the members are not set here; getOcu() assigns the kernel handle and the stream.
+}
+
+ocu_args_d_t::~ocu_args_d_t()
+{
+    // Empty: the stream, module and context created in getOcu() are not released here.
+}
+
+CUdeviceptr ocu_args_d_t::allocMem(size_t s, const void *init)
+{
+    // Allocate 's' bytes on the device: copy 'init' in when given, zero-fill otherwise; returns 0 on failure.
+    CUdeviceptr mem = 0;
+    if (CUDA_SUCCESS != cuMemAlloc(&mem, s)) return 0; // previously an uninitialised handle was written to and returned
+    if (init)
+    {
+        cuMemcpyHtoDAsync(mem, init, s, this->stream); // NOTE(review): asynchronous - confirm 'init' stays valid until the copy completes
+    }
+    else
+    {
+        cuMemsetD8(mem, 0, s);
+    }
+    return mem;
+}
\ No newline at end of file
diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h
new file mode 100644
index 00000000..f33c856f
--- /dev/null
+++ b/clguetzli/ocu.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <cuda.h>
+#include "ocl.h"
+
+struct ocu_args_d_t;
+
+ocu_args_d_t& getOcu(void);
+
+struct ocu_args_d_t // CUDA driver-API state; getOcu() returns a reference to a single static instance
+{
+    ocu_args_d_t();
+    ~ocu_args_d_t();
+    // Returns 's' bytes of device memory, copied from 'init' when it is non-null, zero-filled otherwise.
+    CUdeviceptr allocMem(size_t s, const void *init);
+    // Kernel handles sized by KERNEL_COUNT; getOcu() fills in the KERNEL_SCALEIMAGE slot.
+    CUfunction  kernel[KERNEL_COUNT];
+    CUstream    stream; // created in getOcu(); allocMem() queues its host-to-device copy on it
+};
\ No newline at end of file
diff --git a/compile.bat b/compile.bat
new file mode 100644
index 00000000..b27c9e49
--- /dev/null
+++ b/compile.bat
@@ -0,0 +1,160 @@
+@if "%1" == "" goto start
+@setlocal
+@set userinput=%1
+@if not "%1"=="store" @if not "%1"=="8.1" @if not "%userinput:~0,3%"=="10." goto usage
+@endlocal
+
+:start
+@call :GetVSCommonToolsDir
+@if "%VS140COMNTOOLS%"=="" goto error_no_VS140COMNTOOLSDIR
+
+@call "%VS140COMNTOOLS%VCVarsQueryRegistry.bat" No32bit 64bit %1 %2
+
+@if "%VSINSTALLDIR%"=="" goto error_no_VSINSTALLDIR
+@if "%VCINSTALLDIR%"=="" goto error_no_VCINSTALLDIR
+@if "%FrameworkDir64%"=="" goto error_no_FrameworkDIR64
+@if "%FrameworkVersion64%"=="" goto error_no_FrameworkVer64
+@if "%Framework40Version%"=="" goto error_no_Framework40Version
+
+@set FrameworkDir=%FrameworkDir64%
+@set FrameworkVersion=%FrameworkVersion64%
+
+@if not "%WindowsSDK_ExecutablePath_x64%" == "" @set PATH=%WindowsSDK_ExecutablePath_x64%;%PATH%
+
+@rem
+@rem Set Windows SDK include/lib path
+@rem
+@if not "%WindowsSdkDir%" == "" @set PATH=%WindowsSdkDir%bin\x64;%WindowsSdkDir%bin\x86;%PATH%
+@if not "%WindowsSdkDir%" == "" @set INCLUDE=%WindowsSdkDir%include\%WindowsSDKVersion%shared;%WindowsSdkDir%include\%WindowsSDKVersion%um;%WindowsSdkDir%include\%WindowsSDKVersion%winrt;%INCLUDE%
+@if not "%WindowsSdkDir%" == "" @set LIB=%WindowsSdkDir%lib\%WindowsSDKLibVersion%um\x64;%LIB%
+@if not "%WindowsSdkDir%" == "" @set LIBPATH=%WindowsLibPath%;%ExtensionSDKDir%\Microsoft.VCLibs\14.0\References\CommonConfiguration\neutral;%LIBPATH%
+
+@REM Set NETFXSDK include/lib path
+@if not "%NETFXSDKDir%" == "" @set INCLUDE=%NETFXSDKDir%include\um;%INCLUDE%
+@if not "%NETFXSDKDir%" == "" @set LIB=%NETFXSDKDir%lib\um\x64;%LIB%
+
+@rem
+@rem Set UniversalCRT include/lib path, the default is the latest installed version.
+@rem
+@if not "%UCRTVersion%" == "" @set INCLUDE=%UniversalCRTSdkDir%include\%UCRTVersion%\ucrt;%INCLUDE%
+@if not "%UCRTVersion%" == "" @set LIB=%UniversalCRTSdkDir%lib\%UCRTVersion%\ucrt\x64;%LIB%
+
+@rem PATH
+@rem ----
+@if exist "%VSINSTALLDIR%Team Tools\Performance Tools\x64" @set PATH=%VSINSTALLDIR%Team Tools\Performance Tools\x64;%VSINSTALLDIR%Team Tools\Performance Tools;%PATH%
+
+@if exist "%ProgramFiles%\HTML Help Workshop" set PATH=%ProgramFiles%\HTML Help Workshop;%PATH%
+@if exist "%ProgramFiles(x86)%\HTML Help Workshop" set PATH=%ProgramFiles(x86)%\HTML Help Workshop;%PATH%
+@if exist "%VSINSTALLDIR%Common7\Tools" set PATH=%VSINSTALLDIR%Common7\Tools;%PATH%
+@if exist "%VSINSTALLDIR%Common7\IDE" set PATH=%VSINSTALLDIR%Common7\IDE;%PATH%
+@if exist "%VCINSTALLDIR%VCPackages" set PATH=%VCINSTALLDIR%VCPackages;%PATH%
+@if exist "%FrameworkDir%\%Framework40Version%" set PATH=%FrameworkDir%\%Framework40Version%;%PATH%
+@if exist "%FrameworkDir%\%FrameworkVersion%" set PATH=%FrameworkDir%\%FrameworkVersion%;%PATH%
+@if exist "%VCINSTALLDIR%BIN\amd64" set PATH=%VCINSTALLDIR%BIN\amd64;%PATH%
+
+@rem Add path to MSBuild Binaries
+@if exist "%ProgramFiles%\MSBuild\14.0\bin\amd64" set PATH=%ProgramFiles%\MSBuild\14.0\bin\amd64;%PATH%
+@if exist "%ProgramFiles(x86)%\MSBuild\14.0\bin\amd64" set PATH=%ProgramFiles(x86)%\MSBuild\14.0\bin\amd64;%PATH%
+
+@if exist "%VSINSTALLDIR%Common7\IDE\CommonExtensions\Microsoft\TestWindow" @set PATH=%VSINSTALLDIR%Common7\IDE\CommonExtensions\Microsoft\TestWindow;%PATH%
+
+@rem INCLUDE
+@rem -------
+@if exist "%VCINSTALLDIR%ATLMFC\INCLUDE" set INCLUDE=%VCINSTALLDIR%ATLMFC\INCLUDE;%INCLUDE%
+@if exist "%VCINSTALLDIR%INCLUDE" set INCLUDE=%VCINSTALLDIR%INCLUDE;%INCLUDE%
+
+@rem LIB
+@rem ---
+@if "%1" == "store" goto setstorelib
+@if exist "%VCINSTALLDIR%ATLMFC\LIB\amd64" set LIB=%VCINSTALLDIR%ATLMFC\LIB\amd64;%LIB%
+@if exist "%VCINSTALLDIR%LIB\amd64" set LIB=%VCINSTALLDIR%LIB\amd64;%LIB%
+@goto setlibpath
+:setstorelib
+@if exist "%VCINSTALLDIR%LIB\store\amd64" set LIB=%VCINSTALLDIR%LIB\store\amd64;%LIB%
+
+:setlibpath
+@rem LIBPATH
+@rem -------
+@if "%1" == "store" goto setstorelibpath
+@if exist "%VCINSTALLDIR%ATLMFC\LIB\amd64" set LIBPATH=%VCINSTALLDIR%ATLMFC\LIB\amd64;%LIBPATH%
+@if exist "%VCINSTALLDIR%LIB\amd64" set LIBPATH=%VCINSTALLDIR%LIB\amd64;%LIBPATH%
+@goto appendlibpath
+:setstorelibpath
+@if exist "%VCINSTALLDIR%LIB\store\amd64" set LIBPATH=%VCINSTALLDIR%LIB\store\amd64;%VCINSTALLDIR%LIB\store\references;%LIBPATH%
+:appendlibpath
+@if exist "%FrameworkDir%\%Framework40Version%" set LIBPATH=%FrameworkDir%\%Framework40Version%;%LIBPATH%
+@if exist "%FrameworkDir%\%FrameworkVersion%" set LIBPATH=%FrameworkDir%\%FrameworkVersion%;%LIBPATH%
+
+@set Platform=X64
+@set CommandPromptType=Native
+
+@goto end
+
+@REM -----------------------------------------------------------------------
+:GetVSCommonToolsDir
+@set VS140COMNTOOLS=
+@call :GetVSCommonToolsDirHelper32 HKLM > nul 2>&1
+@if errorlevel 1 call :GetVSCommonToolsDirHelper32 HKCU > nul 2>&1
+@if errorlevel 1 call :GetVSCommonToolsDirHelper64  HKLM > nul 2>&1
+@if errorlevel 1 call :GetVSCommonToolsDirHelper64  HKCU > nul 2>&1
+@exit /B 0
+
+:GetVSCommonToolsDirHelper32
+@for /F "tokens=1,2*" %%i in ('reg query "%1\SOFTWARE\Microsoft\VisualStudio\SxS\VS7" /v "14.0"') DO (
+	@if "%%i"=="14.0" (
+		@SET VS140COMNTOOLS=%%k
+	)
+)
+@if "%VS140COMNTOOLS%"=="" exit /B 1
+@SET VS140COMNTOOLS=%VS140COMNTOOLS%Common7\Tools\
+@exit /B 0
+
+:GetVSCommonToolsDirHelper64
+@for /F "tokens=1,2*" %%i in ('reg query "%1\SOFTWARE\Wow6432Node\Microsoft\VisualStudio\SxS\VS7" /v "14.0"') DO (
+	@if "%%i"=="14.0" (
+		@SET VS140COMNTOOLS=%%k
+	)
+)
+@if "%VS140COMNTOOLS%"=="" exit /B 1
+@SET VS140COMNTOOLS=%VS140COMNTOOLS%Common7\Tools\
+@exit /B 0
+
+@REM -----------------------------------------------------------------------
+:error_no_VS140COMNTOOLSDIR
+@echo ERROR: Cannot determine the location of the VS Common Tools folder.
+@exit /B 1
+
+:error_no_VSINSTALLDIR
+@echo ERROR: Cannot determine the location of the VS installation.
+@exit /B 1
+
+:error_no_VCINSTALLDIR
+@echo ERROR: Cannot determine the location of the VC installation.
+@exit /B 1
+
+:error_no_FrameworkDIR64
+@echo ERROR: Cannot determine the location of the .NET Framework 64bit installation.
+@exit /B 1
+
+:error_no_FrameworkVer64
+@echo ERROR: Cannot determine the version of the .NET Framework 64bit installation.
+@exit /B 1
+
+:error_no_Framework40Version
+@echo ERROR: Cannot determine the .NET Framework 4.0 version.
+@exit /B 1
+
+:usage
+echo Error in script usage. The correct usage is:
+echo     %0
+echo   or
+echo     %0 store
+echo   or
+echo     %0 10.0.10240.0
+echo   or
+echo     %0 store 10.0.10240.0
+@exit /B 1
+:end
+@REM Environment is configured: compile the CUDA source to the PTX file expected next to it.
+@REM The error and usage paths above exit with code 1, so nvcc no longer runs after a failed setup.
+nvcc -Xcompiler "/wd 4819" -arch=sm_30 -ptx -o clguetzli\clguetzli.cu.ptx30 clguetzli\clguetzli.cu
\ No newline at end of file
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 42a13971..7f3b26ca 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -99,24 +99,24 @@
     <LibraryPath>$(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86)</LibraryPath>
   </PropertyGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <PrecompiledHeader>NotUsing</PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <Optimization>Full</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <Optimization>Full</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
       <MinimalRebuild>false</MinimalRebuild>
       <StringPooling>true</StringPooling>
       <PreprocessorDefinitions>ENABLE_OPENCL;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
-      <SubSystem>Console</SubSystem>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
-      <AdditionalLibraryDirectories>$(INTELOCLSDKROOT)lib\x64</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64</AdditionalLibraryDirectories>
     </Link>
     <CustomBuild>
       <Command>"$(INTELOCLSDKROOT)bin\x64\ioc64.exe" -cmd=build -input="%(FullPath)" -output="x64\Release\%(Filename).out" -VS -device=GPU -simd=default -targetos=current            -bo="           "</Command>
@@ -128,16 +128,19 @@
       <LinkObjects>false</LinkObjects>
     </CustomBuild>
     <Intel_OpenCL_Build_Rules />
+    <PostBuildEvent />
+    <PreBuildEvent />
+    <PreBuildEvent />
+    <PreBuildEvent />
     <PostBuildEvent>
-      <Command>
-      </Command>
+      <Command>compile.bat</Command>
     </PostBuildEvent>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>false</IntrinsicFunctions>
@@ -149,38 +152,40 @@
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>shlwapi.lib;OpenCL.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
-      <AdditionalLibraryDirectories>$(INTELOCLSDKROOT)lib\x86</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32</AdditionalLibraryDirectories>
     </Link>
+    <PostBuildEvent>
+      <Command>compile.bat</Command>
+    </PostBuildEvent>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <PrecompiledHeader>NotUsing</PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>ENABLE_OPENCL;ENABLE_OPENCL_CHECK;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
-      <AdditionalLibraryDirectories>$(INTELOCLSDKROOT)lib\x64</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64</AdditionalLibraryDirectories>
     </Link>
     <PostBuildEvent>
-      <Command>
-      </Command>
+      <Command>compile.bat</Command>
     </PostBuildEvent>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(INTELOCLSDKROOT)include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
@@ -188,11 +193,14 @@
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>shlwapi.lib;OpenCL.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
-      <AdditionalLibraryDirectories>$(INTELOCLSDKROOT)lib\x86</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32</AdditionalLibraryDirectories>
     </Link>
+    <PostBuildEvent>
+      <Command>compile.bat</Command>
+    </PostBuildEvent>
   </ItemDefinitionGroup>
   <ItemGroup>
     <ClInclude Include="clguetzli\clbutter_comparator.h" />
@@ -200,6 +208,7 @@
     <ClInclude Include="clguetzli\clguetzli.h" />
     <ClInclude Include="clguetzli\clguetzli_test.h" />
     <ClInclude Include="clguetzli\ocl.h" />
+    <ClInclude Include="clguetzli\ocu.h" />
     <ClInclude Include="clguetzli\utils.h" />
     <ClInclude Include="guetzli\butteraugli_comparator.h" />
     <ClInclude Include="guetzli\color_transform.h" />
@@ -297,6 +306,7 @@
     <ClCompile Include="clguetzli\clguetzli.cpp" />
     <ClCompile Include="clguetzli\clguetzli_test.cpp" />
     <ClCompile Include="clguetzli\ocl.cpp" />
+    <ClCompile Include="clguetzli\ocu.cpp" />
     <ClCompile Include="clguetzli\utils.cpp" />
     <ClCompile Include="guetzli\butteraugli_comparator.cc" />
     <ClCompile Include="guetzli\dct_double.cc" />
@@ -388,6 +398,12 @@
       <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
       </Command>
     </Intel_OpenCL_Build_Rules>
+    <None Include="clguetzli\clguetzli.cu">
+      <FileType>Document</FileType>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CUDA Code Builder</Message>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)clguetzli\compile.bat</Command>
+    </None>
     <None Include="third_party\libpng\pngwin.def" />
     <None Include="third_party\zlib\inffas32.asm" />
     <None Include="third_party\zlib\match32.asm" />
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index fc895c38..bfdedbe0 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -312,6 +312,9 @@
     <ClInclude Include="clguetzli\clguetzli.cl.h">
       <Filter>clguetzli</Filter>
     </ClInclude>
+    <ClInclude Include="clguetzli\ocu.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc">
@@ -581,6 +584,9 @@
     <ClCompile Include="clguetzli\clbutter_comparator.cpp">
       <Filter>clguetzli</Filter>
     </ClCompile>
+    <ClCompile Include="clguetzli\ocu.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <None Include="third_party\libpng\pngwin.def">
@@ -598,6 +604,9 @@
     <None Include="third_party\zlib\zlib.def">
       <Filter>third_party\zlib</Filter>
     </None>
+    <None Include="clguetzli\clguetzli.cu">
+      <Filter>clguetzli</Filter>
+    </None>
   </ItemGroup>
   <ItemGroup>
     <Intel_OpenCL_Build_Rules Include="clguetzli\clguetzli.cl">

From 99631b87fea40e3e5faf6ef2dac39845b643a98c Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 1 Jun 2017 10:00:43 +0800
Subject: [PATCH 116/189] support cuda opt

---
 clguetzli/clbutter_comparator.cpp |  9 ++++++++-
 clguetzli/clguetzli.cpp           | 22 ++++++++++++++++++++++
 clguetzli/clguetzli.cu            | 21 ++++++++++++++++++++-
 clguetzli/clguetzli.h             |  3 +++
 guetzli/guetzli.cc                |  4 ++++
 5 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index c6b4ca0b..fd31632d 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -229,7 +229,14 @@ namespace butteraugli
             result_org = *result;
         }
 
-        _ScaleImage(scale, result);
+        if (g_useCuda)
+        {
+            cuScaleImage(result->data(), result->size(), scale);
+        }
+        else
+        {
+            _ScaleImage(scale, result);
+        }
 
         if (g_checkOpenCL && result->size() > 64)
         {
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index b58f7dc8..77aa68bb 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -2,8 +2,10 @@
 #include <algorithm>
 #include <vector>
 #include "clguetzli.h"
+#include "ocu.h"
 
 extern bool g_useOpenCL = false;
+extern bool g_useCuda = false;
 extern bool g_checkOpenCL = false;
 
 ocl_args_d_t& getOcl(void)
@@ -1225,3 +1227,23 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si
 	clReleaseMemObject(blurred);
 }
 
+void cuScaleImage(float *img, size_t length, double scale) // Runs KERNEL_SCALEIMAGE over a device copy of img and copies the result back into img.
+{
+	ocu_args_d_t &ocu = getOcu();
+	CUdeviceptr m = ocu.allocMem(length * sizeof(float), img); // device buffer of `length` floats; presumably initialised from img -- confirm in ocu_args_d_t::allocMem
+	// NOTE(review): the launch below uses a 1x1x1 grid and one block of `length` threads; a length above the device's max threads per block is rejected, and r is never checked.
+	void *args[2] = { &m, &scale};
+
+	CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE],
+                   1, 1, 1,
+                   length, 1, 1,
+                   0,
+                   ocu.stream, args, NULL);
+
+    r = cuStreamSynchronize(ocu.stream);
+
+    cuMemcpyDtoH(img, m, length * sizeof(float));
+
+	cuMemFree(m);
+	return;
+}
\ No newline at end of file
diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu
index b76a81e7..3ed6d05e 100644
--- a/clguetzli/clguetzli.cu
+++ b/clguetzli/clguetzli.cu
@@ -1,5 +1,24 @@
+#ifdef __CUDACC__
+//#ifdef __OPENCL_VERSION__
+__device__ int get_global_id(int dim)
+{   // OpenCL-style global work-item index for `dim`: block offset plus the thread index inside the block.
+    switch (dim)
+    {
+    case 0:
+        return blockIdx.x * blockDim.x + threadIdx.x;
+    case 1:
+        return blockIdx.y * blockDim.y + threadIdx.y;
+    case 2:
+        return blockIdx.z * blockDim.z + threadIdx.z;
+    default:  // unknown dimension: keep the previous behaviour of mirroring dimension 0
+        return blockIdx.x * blockDim.x + threadIdx.x;
+    }
+}
+#endif
+
+
 __global__ void clScaleImageEx(float *img, double scale)
 {
-    const int i = blockIdx.x;
+    const int i = get_global_id(0);
     img[i] *= scale;
 }
\ No newline at end of file
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index b5997fcd..760677fd 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -7,6 +7,7 @@
 #include "clguetzli.cl.h"
 
 extern bool g_useOpenCL;
+extern bool g_useCuda;
 extern bool g_checkOpenCL;
 
 void clOpsinDynamicsImage(
@@ -134,6 +135,8 @@ void clAddBorderEx(cl_mem out, const size_t xsize, const size_t ysize, const int
 
 void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step);
 
+void cuScaleImage(float *img, size_t length, double scale);
+
 class guetzli::OutputImage;
 
 namespace guetzli {
diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc
index 40544d90..d8937978 100644
--- a/guetzli/guetzli.cc
+++ b/guetzli/guetzli.cc
@@ -227,6 +227,7 @@ void Usage() {
       "  --memlimit M - Memory limit in MB. Guetzli will fail if unable to stay under\n"
       "                 the limit. Default limit is %d MB.\n"
 	  "  --opencl     - Use OpenCL\n"
+	  "  --cuda       - Use CUDA\n"
       "  --nomemlimit - Do not limit memory usage.\n", kDefaultJPEGQuality, kDefaultMemlimitMB);
   exit(1);
 }
@@ -262,6 +263,9 @@ int main(int argc, char** argv) {
 	else if (!strcmp(argv[opt_idx], "--opencl")) {
 		g_useOpenCL = true;
 	}
+	else if (!strcmp(argv[opt_idx], "--cuda")) {
+		g_useCuda = true;
+	}
 	else if (!strcmp(argv[opt_idx], "--checkcl")) {
 		g_checkOpenCL = true;
 	}

From ef025dab8c43b3200cd616508efdd8cb817a73a2 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 1 Jun 2017 14:37:37 +0800
Subject: [PATCH 117/189] =?UTF-8?q?=E8=BF=90=E8=A1=8C=E6=9C=9F=E7=BC=96?=
 =?UTF-8?q?=E8=AF=91.cu?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl.cpp   |  2 +-
 clguetzli/clguetzli.cl.h     | 15 ++++++++--
 clguetzli/clguetzli.cpp      |  6 ++--
 clguetzli/clguetzli.cu       |  8 ++++--
 clguetzli/clguetzli_test.cpp |  7 -----
 clguetzli/ocu.cpp            | 54 ++++++++++++++++++++++++++----------
 clguetzli/ocu.h              |  3 ++
 compile.bat                  |  2 +-
 guetzli.vcxproj              | 18 ++++--------
 9 files changed, 71 insertions(+), 44 deletions(-)

diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp
index b3203fe9..cafb0bf7 100644
--- a/clguetzli/clguetzli.cl.cpp
+++ b/clguetzli/clguetzli.cl.cpp
@@ -22,7 +22,7 @@ void set_global_size(int dim, int size){
     g_sizevec[dim] = size;
 }
 
-#define __opencl
+#define __checkcl
 #define abs(exper)    fabs((exper))
 #include "clguetzli.h"
 #include "clguetzli.cl"
diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h
index 35b4ed3c..cf4d9212 100644
--- a/clguetzli/clguetzli.cl.h
+++ b/clguetzli/clguetzli.cl.h
@@ -2,6 +2,7 @@
 #define __CLGUETZLI_CL_H__
 
 #ifdef __cplusplus
+#ifndef __CUDACC__
     #define __kernel
     #define __private
     #define __global
@@ -14,7 +15,7 @@
     void set_global_id(int dim, int id);
     void set_global_size(int dim, int size);
 
-    #ifdef __opencl
+    #ifdef __checkcl
         typedef union ocl_channels_t
         {
             struct
@@ -49,7 +50,10 @@
             };
         }ocl_channels;
     #endif
-#else /*__cplusplus*/
+#endif
+#endif /*__cplusplus*/
+
+#ifdef __OPENCL_VERSION__
     typedef union ocl_channels_t
     {
         struct
@@ -65,7 +69,12 @@
         };
     }ocl_channels;
 
-#endif /*__cplusplus*/
+#endif /*__OPENCL_VERSION__*/
+
+#ifdef __CUDACC__
+    #define __global
+    typedef unsigned short ushort;
+#endif /*__CUDACC__*/
 
     typedef short coeff_t;
 
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 77aa68bb..0606793b 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -1,7 +1,7 @@
+#include "clguetzli.h"
 #include <math.h>
 #include <algorithm>
 #include <vector>
-#include "clguetzli.h"
 #include "ocu.h"
 
 extern bool g_useOpenCL = false;
@@ -1235,14 +1235,14 @@ void cuScaleImage(float *img, size_t length, double scale)
 	void *args[2] = { &m, &scale};
 
 	CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE],
+        length, 1, 1,
                    1, 1, 1,
-                   length, 1, 1,
                    0,
                    ocu.stream, args, NULL);
 
     r = cuStreamSynchronize(ocu.stream);
 
-    cuMemcpyDtoH(img, m, length * sizeof(float));
+    r = cuMemcpyDtoH(img, m, length * sizeof(float));
 
 	cuMemFree(m);
 	return;
diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu
index 3ed6d05e..17b65143 100644
--- a/clguetzli/clguetzli.cu
+++ b/clguetzli/clguetzli.cu
@@ -1,3 +1,5 @@
+#include "clguetzli\clguetzli.cl.h"
+
 #ifdef __CUDACC__
 //#ifdef __OPENCL_VERSION__
 __device__ int get_global_id(int dim)
@@ -17,8 +19,8 @@ __device__ int get_global_id(int dim)
 #endif
 
 
-__global__ void clScaleImageEx(float *img, double scale)
+extern "C" __global__ void clScaleImageEx(float * img, double scale)
 {
     const int i = get_global_id(0);
-    img[i] *= scale;
-}
\ No newline at end of file
+    img[i] = 0.0001;
+}
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index e98e6369..15c1317b 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -405,13 +405,6 @@ void tclMinSquareVal(const float *img, size_t square_size, size_t offset,
 
 void tclScaleImage(double scale, const float *result_org, const float *result_cmp, size_t length)
 {
-/*
-    ocu_args_d_t &ocu = getOcu();
-    CUdeviceptr m = ocu.allocMem(length * sizeof(float), result_org);
-    cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE],
-    cuMemFree(m);
-    return;
-*/
     cl_int err = 0;
     ocl_args_d_t &ocl = getOcl();
     cl_mem mem_result_org = ocl.allocMem(length * sizeof(float), result_org);
diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index 5b5da9b7..7846afcb 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -1,5 +1,5 @@
-
 #include <cuda.h>
+#include <nvrtc.h>
 #include "ocu.h"
 
 ocu_args_d_t& getOcu(void)
@@ -9,13 +9,12 @@ ocu_args_d_t& getOcu(void)
 
     if (bInit == true) return ocu;
 
-    cuInit(0);
-
-    CUresult r;
-    CUcontext ctxt;
+    CUresult r = cuInit(0);
     CUdevice dev = 0;
+    CUcontext ctxt;
+    CUstream  stream;
 
-    cuCtxCreate(&ctxt, CU_CTX_SCHED_BLOCKING_SYNC, dev);
+    r = cuCtxCreate(&ctxt, CU_CTX_SCHED_BLOCKING_SYNC, dev);
 
     char name[1024];
     int proc_count = 0;
@@ -26,28 +25,53 @@ ocu_args_d_t& getOcu(void)
     cuDeviceGetAttribute(&cap_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
     cuDeviceGetAttribute(&proc_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
     cuDeviceGetAttribute(&thread_count, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
-    LogError("CUDA Adapter:%s Ver%d.%d (%d x %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count);
-
-    CUmodule mod;
+    LogError("CUDA Adapter:%s Ver%d.%d MP %d Core %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count);
 
     char* source = nullptr;
     size_t src_size = 0;
-    ReadSourceFromFile("clguetzli/clguetzli.cu.ptx30", &source, &src_size);
+    ReadSourceFromFile("clguetzli/clguetzli.cu", &source, &src_size);
+
+    nvrtcProgram prog;
+    const char *opts[] = { "-arch=compute_30", "--fmad=false" };
+    nvrtcCreateProgram(&prog, source, "clguetzli.cu", 0, NULL, NULL);
+    nvrtcCompileProgram(prog, 2, opts);
 
+    // Obtain compilation log from the program.
+    size_t logSize = 0;
+    nvrtcGetProgramLogSize(prog, &logSize);
+    char *log = new char[logSize];
+    nvrtcGetProgramLog(prog, log);
+
+    // Obtain PTX from the program.
+    size_t ptxSize = 0;
+    nvrtcGetPTXSize(prog, &ptxSize);
+    char *ptx = new char[ptxSize];
+    nvrtcGetPTX(prog, ptx);
+
+    LogError("BuildInfo:\r\n%s\r\n", log);
+
+    CUmodule mod;
     CUjit_option jit_options[2];
     void *jit_optvals[2];
     jit_options[0] = CU_JIT_CACHE_MODE;
     jit_optvals[0] = (void*)(uintptr_t)CU_JIT_CACHE_OPTION_CA;
-    cuModuleLoadDataEx(&mod, source, 1, jit_options, jit_optvals);
+    r = cuModuleLoadDataEx(&mod, ptx, 1, jit_options, jit_optvals);
 
     delete[] source;
+    delete[] log;
+    delete[] ptx;
 
-    cuModuleGetFunction(&ocu.kernel[KERNEL_SCALEIMAGE], mod, "clScaleImageEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_SCALEIMAGE], mod, "clScaleImageEx");
 
     cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED);
     cuCtxSetSharedMemConfig(CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE);
 
-    cuStreamCreate(&ocu.stream, 0);
+    cuStreamCreate(&stream, 0);
+
+    ocu.dev = dev;
+    ocu.stream = stream;
+    ocu.mod = mod;
+    ocu.ctxt = ctxt;
 
     return ocu;
 }
@@ -59,7 +83,9 @@ ocu_args_d_t::ocu_args_d_t()
 
 ocu_args_d_t::~ocu_args_d_t()
 {
-
+    cuStreamDestroy(stream);  // the stream and module live inside ctxt, so release them while it still exists
+    cuModuleUnload(mod);
+    cuCtxDestroy(ctxt);       // destroy the owning context last
 }
 
 CUdeviceptr ocu_args_d_t::allocMem(size_t s, const void *init)
diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h
index f33c856f..0ab97945 100644
--- a/clguetzli/ocu.h
+++ b/clguetzli/ocu.h
@@ -16,4 +16,7 @@ struct ocu_args_d_t
 
     CUfunction  kernel[KERNEL_COUNT];
     CUstream    stream;
+    CUmodule    mod;
+    CUcontext   ctxt;
+    CUdevice    dev;
 };
\ No newline at end of file
diff --git a/compile.bat b/compile.bat
index b27c9e49..05cdd472 100644
--- a/compile.bat
+++ b/compile.bat
@@ -157,4 +157,4 @@ echo     %0 store 10.0.10240.0
 :end
 
 
-nvcc -Xcompiler "/wd 4819" -arch=sm_30 -ptx -o clguetzli\clguetzli.cu.ptx30 clguetzli\clguetzli.cu
\ No newline at end of file
+nvcc -Xcompiler "/wd 4819" -I"./" -arch=sm_30 -ptx -o clguetzli\clguetzli.cu.ptx30 clguetzli\clguetzli.cu
\ No newline at end of file
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 7f3b26ca..e31abaff 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -114,7 +114,7 @@
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64</AdditionalLibraryDirectories>
     </Link>
@@ -132,9 +132,7 @@
     <PreBuildEvent />
     <PreBuildEvent />
     <PreBuildEvent />
-    <PostBuildEvent>
-      <Command>compile.bat</Command>
-    </PostBuildEvent>
+    <PostBuildEvent />
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
     <ClCompile>
@@ -173,13 +171,11 @@
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64</AdditionalLibraryDirectories>
     </Link>
-    <PostBuildEvent>
-      <Command>compile.bat</Command>
-    </PostBuildEvent>
+    <PostBuildEvent />
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
     <ClCompile>
@@ -193,14 +189,12 @@
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
       <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32</AdditionalLibraryDirectories>
     </Link>
-    <PostBuildEvent>
-      <Command>compile.bat</Command>
-    </PostBuildEvent>
+    <PostBuildEvent />
   </ItemDefinitionGroup>
   <ItemGroup>
     <ClInclude Include="clguetzli\clbutter_comparator.h" />

From a8bcf1f2df768730ce378f239da0ffb22cdc2512 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 1 Jun 2017 19:24:49 +0800
Subject: [PATCH 118/189] =?UTF-8?q?=E5=85=BC=E5=AE=B9CUDA=E7=BC=96?=
 =?UTF-8?q?=E8=AF=91=EF=BC=8C=E7=BC=96=E8=AF=91=E5=99=A8=E8=AF=AD=E6=B3=95?=
 =?UTF-8?q?=E6=A3=80=E6=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl   | 141 ++++++++++++++++++++-------------------
 clguetzli/clguetzli.cl.h |  19 +++++-
 clguetzli/clguetzli.cpp  |   4 +-
 clguetzli/clguetzli.cu   |  11 +--
 clguetzli/ocu.cpp        |  19 ++++++
 compile.bat              |   3 +-
 guetzli.vcxproj          |  18 +++--
 guetzli.vcxproj.filters  |   8 ++-
 8 files changed, 132 insertions(+), 91 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index cf7bca3e..ec04630d 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -22,21 +22,21 @@ typedef struct __IntFloatPairList
     IntFloatPair *pData;
 }IntFloatPairList;
 
-void   XybToVals(double x, double y, double z, double *valx, double *valy, double *valz);
-double InterpolateClampNegative(__global const double *array, int size, double sx);
-void   XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0,
+__device__ void   XybToVals(double x, double y, double z, double *valx, double *valy, double *valz);
+__device__ double InterpolateClampNegative(__global const double *array, int size, double sx);
+__device__ void   XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0,
                                        double r1, double g1, double b1,
                                        double factor, double res[3]);
-double DotProduct(__global const float u[3], const double v[3]);
-void   OpsinAbsorbance(const double in[3], double out[3]);
-void   RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz);
-double Gamma(double v);
-void   ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize],
+__device__ double DotProduct(__global const float u[3], const double v[3]);
+__device__ void   OpsinAbsorbance(const double in[3], double out[3]);
+__device__ void   RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz);
+__device__ double Gamma(double v);
+__device__ void   ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize],
     __private double xyb1[3 * kBlockSize],
     double diff_xyb_dc[3],
     double diff_xyb_ac[3],
     double diff_xyb_edge_dc[3]);
-void Butteraugli8x8CornerEdgeDetectorDiff(
+__device__ void Butteraugli8x8CornerEdgeDetectorDiff(
     int pos_x,
     int pos_y,
     int xsize,
@@ -45,9 +45,9 @@ void Butteraugli8x8CornerEdgeDetectorDiff(
     __global const float *r2, __global const float* g2, __global const float *b2,
     double* diff_xyb);
 
-int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order);
+__device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order);
 
-double CompareBlockFactor(const channel_info mayout_channel[3],
+__device__ double CompareBlockFactor(const channel_info mayout_channel[3],
                         const coeff_t* candidate_block,
                         const int block_x,
                         const int block_y,
@@ -57,11 +57,11 @@ double CompareBlockFactor(const channel_info mayout_channel[3],
                         const int image_height,
                         const int factor);
 
-void floatcopy(float *dst, const float *src, int size);
-void coeffcopy(coeff_t *dst, const coeff_t *src, int size);
-void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size);
-int list_erase(IntFloatPairList* list, int idx);
-int list_push_back(IntFloatPairList* list, int i, float f);
+__device__ void floatcopy(float *dst, const float *src, int size);
+__device__ void coeffcopy(coeff_t *dst, const coeff_t *src, int size);
+__device__ void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size);
+__device__ int list_erase(IntFloatPairList* list, int idx);
+__device__ int list_push_back(IntFloatPairList* list, int i, float f);
 
 __kernel void clConvolutionEx(
 	__global float* result,
@@ -842,7 +842,7 @@ __kernel void clComputeBlockZeroingOrderEx(
     }
 }
 
-void Butteraugli8x8CornerEdgeDetectorDiff(
+__device__ void Butteraugli8x8CornerEdgeDetectorDiff(
     int pos_x,
     int pos_y,
     int xsize,
@@ -898,11 +898,11 @@ void Butteraugli8x8CornerEdgeDetectorDiff(
     }
 }
 
-double DotProduct(__global const float u[3], const double v[3]) {
+__device__ double DotProduct(__global const float u[3], const double v[3]) {
     return u[0] * v[0] + u[1] * v[1] + u[2] * v[2];
 }
 
-double Interpolate(__constant const double *array, const int size, const double sx) {
+__device__ double Interpolate(__constant_ex const double *array, const int size, const double sx) {
     double ix = fabs(sx);
 
     int baseix = (int)(ix);
@@ -971,7 +971,7 @@ __constant double XybToVals_lut_y[21] = {
     XybToVals_off_y + 19 * XybToVals_inc_y,
 };
 
-void XybToVals(
+__device__ void XybToVals(
     double x, double y, double z,
     double *valx, double *valy, double *valz)
 {
@@ -1009,7 +1009,7 @@ __constant double XybLowFreqToVals_lut[21] = {
     20 * XybLowFreqToVals_inc,
 };
 
-void XybLowFreqToVals(double x, double y, double z,
+__device__ void XybLowFreqToVals(double x, double y, double z,
     double *valx, double *valy, double *valz) {
     const double xmul = 6.64482198135;
     const double ymul = 0.837846224276;
@@ -1022,7 +1022,7 @@ void XybLowFreqToVals(double x, double y, double z,
     *valy = Interpolate(&XybLowFreqToVals_lut[0], 21, y * ymul);
 }
 
-double InterpolateClampNegative(__global const double *array,
+__device__ double InterpolateClampNegative(__global const double *array,
 	int size, double sx) {
 	if (sx < 0) {
 		sx = 0;
@@ -1041,7 +1041,7 @@ double InterpolateClampNegative(__global const double *array,
 	return res;
 }
 
-void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0,
+__device__ void XybDiffLowFreqSquaredAccumulate(double r0, double g0, double b0,
 	double r1, double g1, double b1,
 	double factor, double res[3]) {
 	double valx0, valy0, valz0;
@@ -1072,7 +1072,7 @@ typedef struct __Complex
 }Complex;
 
 __constant double kSqrtHalf = 0.70710678118654752440084436210484903;
-void RealFFT8(const double* in, Complex* out) {
+__device__ void RealFFT8(const double* in, Complex* out) {
 	double t1, t2, t3, t5, t6, t7, t8;
 	t8 = in[6];
 	t5 = in[2] - t8;
@@ -1145,7 +1145,7 @@ void RealFFT8(const double* in, Complex* out) {
 	out[6] = tmp;
 }
 
-void TransposeBlock(Complex data[kBlockSize]) {
+__device__ void TransposeBlock(Complex data[kBlockSize]) {
 	for (int i = 0; i < kBlockEdge; i++) {
 		for (int j = 0; j < i; j++) {
 			Complex tmp = data[kBlockEdge * i + j];
@@ -1156,7 +1156,7 @@ void TransposeBlock(Complex data[kBlockSize]) {
 }
 
 //  D. J. Bernstein's Fast Fourier Transform algorithm on 4 elements.
-inline void FFT4(Complex* a) {
+__device__ inline void FFT4(Complex* a) {
 	double t1, t2, t3, t4, t5, t6, t7, t8;
 	t5 = a[2].real;
 	t1 = a[0].real - t5;
@@ -1186,7 +1186,7 @@ inline void FFT4(Complex* a) {
 }
 
 //  D. J. Bernstein's Fast Fourier Transform algorithm on 8 elements.
-void FFT8(Complex* a) {
+__device__ void FFT8(Complex* a) {
 	const double kSqrtHalf = 0.70710678118654752440084436210484903;
 	double t1, t2, t3, t4, t5, t6, t7, t8;
 
@@ -1280,11 +1280,11 @@ void FFT8(Complex* a) {
 	a[6] = tmp;
 }
 
-double abssq(const Complex c) {
+__device__ double abssq(const Complex c) {
 	return c.real * c.real + c.imag * c.imag;
 }
 
-void ButteraugliFFTSquared(__private double block[kBlockSize]) {
+__device__ void ButteraugliFFTSquared(__private double block[kBlockSize]) {
 	double global_mul = 0.000064;
 	Complex block_c[kBlockSize];
 
@@ -1309,7 +1309,7 @@ void ButteraugliFFTSquared(__private double block[kBlockSize]) {
 	}
 }
 
-double RemoveRangeAroundZero(double v, double range) {
+__device__ double RemoveRangeAroundZero(double v, double range) {
 	if (v >= -range && v < range) {
 		return 0;
 	}
@@ -1390,7 +1390,7 @@ __constant double csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = {
 // Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared
 // 3-dimensional xybdiff of the two blocks to diff_xyb_{dc,ac} and the average
 // diff on the edges to diff_xyb_edge_dc.
-void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize],
+__device__ void ButteraugliBlockDiff(__private double xyb0[3 * kBlockSize],
 	__private double xyb1[3 * kBlockSize],
 	double diff_xyb_dc[3],
 	double diff_xyb_ac[3],
@@ -1488,14 +1488,14 @@ __constant static float g_mix[12] = {
     10.6524069248,
 };
 
-void OpsinAbsorbance(const double in[3], double out[3])
+__device__ void OpsinAbsorbance(const double in[3], double out[3])
 {
     out[0] = g_mix[0] * in[0] + g_mix[1] * in[1] + g_mix[2] * in[2] + g_mix[3];
     out[1] = g_mix[4] * in[0] + g_mix[5] * in[1] + g_mix[6] * in[2] + g_mix[7];
     out[2] = g_mix[8] * in[0] + g_mix[9] * in[1] + g_mix[10] * in[2] + g_mix[11];
 }
 
-double EvaluatePolynomial(const double x, __constant const double *coefficients, int n)
+__device__ double EvaluatePolynomial(const double x, __constant_ex const double *coefficients, int n)
 {
     double b1 = 0.0;
     double b2 = 0.0;
@@ -1526,7 +1526,7 @@ static __constant double g_gamma_q[5 + 1] = {
     4.711532733641639, 0.899112889751053, 0.035662329617191,
 };
 
-double Gamma(double v)
+__device__ double Gamma(double v)
 {
     const double min_value = 0.770000000000000;
     const double max_value = 274.579999999999984;
@@ -1539,7 +1539,7 @@ double Gamma(double v)
     return (float)(yp / yq);
 }
 
-void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz)
+__device__ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *valz)
 {
     const double a0 = 1.01611726948;
     const double a1 = 0.982482243696;
@@ -1552,7 +1552,7 @@ void RgbToXyb(double r, double g, double b, double *valx, double *valy, double *
 
 // chrisk todo
 // return size
-int list_push_back(IntFloatPairList* list, int i, float f)
+__device__ int list_push_back(IntFloatPairList* list, int i, float f)
 {
 	list->pData[list->size].idx = i;
 	list->pData[list->size].err = f;
@@ -1561,7 +1561,7 @@ int list_push_back(IntFloatPairList* list, int i, float f)
 
 // chrisk todo
 // remove idx and return size
-int list_erase(IntFloatPairList* list, int idx)
+__device__ int list_erase(IntFloatPairList* list, int idx)
 {
 	for (int i = idx; i < list->size - 1; i++)
 	{
@@ -1572,7 +1572,7 @@ int list_erase(IntFloatPairList* list, int idx)
 }
 
 // chrisk todo
-int SortInputOrder(DCTScoreData* input_order, int size)
+__device__  int SortInputOrder(DCTScoreData* input_order, int size)
 {
 	int i, j;
 	DCTScoreData tmp;
@@ -1984,7 +1984,7 @@ __constant static float bias[192] = {
 
 // chrisk todo
 // return the count of Non-zero item
-int MakeInputOrder(__global const coeff_t *block, __global const coeff_t *orig_block, IntFloatPairList *input_order, int block_size)
+__device__ int MakeInputOrder(__global const coeff_t *block, __global const coeff_t *orig_block, IntFloatPairList *input_order, int block_size)
 {
 	int size = 0;
 	for (int c = 0; c < 3; ++c) {
@@ -2011,7 +2011,7 @@ __constant static int kIDCTMatrix[kDCTBlockSize] = {
 };
 
 // Computes out[x] = sum{kIDCTMatrix[8*x+u]*in[u*stride]; for u in [0..7]}
-void Compute1dIDCT(const coeff_t* in, const int stride, int out[8]) {
+__device__ void Compute1dIDCT(const coeff_t* in, const int stride, int out[8]) {
 	int tmp0, tmp1, tmp2, tmp3, tmp4;
 
 	tmp1 = kIDCTMatrix[0] * in[0];
@@ -2109,7 +2109,7 @@ void Compute1dIDCT(const coeff_t* in, const int stride, int out[8]) {
 	out[7] -= tmp1;
 }
 
-void CoeffToIDCT(__private const coeff_t block[8*8], uchar out[8*8])
+__device__ void CoeffToIDCT(__private const coeff_t block[8*8], uchar out[8*8])
 {
 	coeff_t colidcts[kDCTBlockSize];
 	const int kColScale = 11;
@@ -2136,7 +2136,7 @@ void CoeffToIDCT(__private const coeff_t block[8*8], uchar out[8*8])
 	}
 }
 
-void IDCTToPixel8x8(const uchar idct[8 * 8], ushort pixels_[8 * 8])
+__device__ void IDCTToPixel8x8(const uchar idct[8 * 8], ushort pixels_[8 * 8])
 {
     const int block_x = 0;
     const int block_y = 0;
@@ -2154,7 +2154,7 @@ void IDCTToPixel8x8(const uchar idct[8 * 8], ushort pixels_[8 * 8])
     }
 }
 
-void IDCTToPixel16x16(const uchar idct[8 * 8], ushort pixels_out[16 * 16], __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_)
+__device__ void IDCTToPixel16x16(const uchar idct[8 * 8], ushort pixels_out[16 * 16], __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_)
 {
     // Fill in the 10x10 pixel area in the subsampled image that will be the
     // basis of the upsampling. This area is enough to hold the 3x3 kernel of
@@ -2223,7 +2223,7 @@ void IDCTToPixel16x16(const uchar idct[8 * 8], ushort pixels_out[16 * 16], __glo
 }
 
 // out = [YUVYUV....YUVYUV]
-void PixelToYUV(ushort pixels_[8 * 8], uchar out[8 * 8], int xsize/* = 8*/, int ysize/* = 8*/)
+__device__ void PixelToYUV(ushort pixels_[8 * 8], uchar out[8 * 8], int xsize/* = 8*/, int ysize/* = 8*/)
 {
     const int stride = 3;
 
@@ -2423,9 +2423,10 @@ __constant static uchar kRangeLimitLut[4 * 256] = {
 	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 };
 
-void YUVToRGB(__private uchar pixelBlock[3*8*8], int size /*= 8 * 8*/)
+__device__ void YUVToRGB(__private uchar pixelBlock[3*8*8], int size /*= 8 * 8*/)
 {
-	__constant uchar* kRangeLimit = kRangeLimitLut + 384;
+    __constant_ex uchar* kRangeLimit = kRangeLimitLut + 384;
+
 	for (int i = 0; i < size; i++)
 	{
 		uchar *pixel = &pixelBlock[i * 3];
@@ -2698,12 +2699,12 @@ __constant static double kSrgb8ToLinearTable[256] = {
 	255.000000,
 };
 
-
-void YUVToImage(__private uchar yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize/* = 8*/, int ysize/* = 8*/, int inside_x/* = 8*/, int inside_y/* = 8*/)
+__device__ void YUVToImage(__private uchar yuv[3 * 8 * 8], float* r, float* g, float* b, int xsize/* = 8*/, int ysize/* = 8*/, int inside_x/* = 8*/, int inside_y/* = 8*/)
 {
     YUVToRGB(yuv, xsize * ysize);
 
-    const __constant double* lut = kSrgb8ToLinearTable;
+#define lut kSrgb8ToLinearTable
+//    const __constant double* lut = kSrgb8ToLinearTable;
 
     for (int i = 0; i < xsize * ysize; i++)
     {
@@ -2731,11 +2732,11 @@ void YUVToImage(__private uchar yuv[3 * 8 * 8], float* r, float* g, float* b, in
             b[y * xsize + x] = b[idx];
         }
     }
+#undef lut
 }
 
-
 // chrisk todo
-void BlockToImage(__private const coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y)
+__device__ void BlockToImage(__private const coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y)
 {
 	uchar idct[3][8 * 8];
 	CoeffToIDCT(&block[0], idct[0]);
@@ -2782,7 +2783,7 @@ void BlockToImage(__private const coeff_t block[8*8*3], float r[8*8], float g[8*
     }
 }
 
-void CoeffToYUV16x16(__private const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_)
+__device__ void CoeffToYUV16x16(__private const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_)
 {
     uchar idct[8 * 8];
     CoeffToIDCT(&block[0], &idct[0]);
@@ -2793,7 +2794,7 @@ void CoeffToYUV16x16(__private const coeff_t block[8 * 8], uchar *yuv, __global
     PixelToYUV(pixels, yuv, 16, 16);
 }
 
-void CoeffToYUV16x16_g(__global const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_)
+__device__ void CoeffToYUV16x16_g(__global const coeff_t block[8 * 8], uchar *yuv, __global const ushort *pixel_orig, int block_x, int block_y, int width_, int height_)
 {
     coeff_t b[8 * 8];
     for (int i = 0; i < 8 * 8; i++)
@@ -2803,7 +2804,7 @@ void CoeffToYUV16x16_g(__global const coeff_t block[8 * 8], uchar *yuv, __global
     CoeffToYUV16x16(b, yuv, pixel_orig, block_x, block_y, width_, height_);
 }
 
-void CoeffToYUV8x8(__private const coeff_t block[8 * 8], uchar *yuv)
+__device__ void CoeffToYUV8x8(__private const coeff_t block[8 * 8], uchar *yuv)
 {
     uchar idct[8 * 8];
     CoeffToIDCT(&block[0], &idct[0]);
@@ -2814,7 +2815,7 @@ void CoeffToYUV8x8(__private const coeff_t block[8 * 8], uchar *yuv)
     PixelToYUV(pixels, yuv, 8, 8);
 }
 
-void CoeffToYUV8x8_g(__global const coeff_t block[8 * 8], uchar *yuv)
+__device__ void CoeffToYUV8x8_g(__global const coeff_t block[8 * 8], uchar *yuv)
 {
     coeff_t b[8 * 8];
     for (int i = 0; i < 8 * 8; i++)
@@ -2825,7 +2826,7 @@ void CoeffToYUV8x8_g(__global const coeff_t block[8 * 8], uchar *yuv)
     CoeffToYUV8x8(b, yuv);
 }
 
-void Copy8x8To16x16(const uchar yuv8x8[3 * 8 * 8], uchar yuv16x16[3 * 16 * 16], int off_x, int off_y)
+__device__ void Copy8x8To16x16(const uchar yuv8x8[3 * 8 * 8], uchar yuv16x16[3 * 16 * 16], int off_x, int off_y)
 {
     for (int y = 0; y < 8; y++)
     {
@@ -2838,7 +2839,7 @@ void Copy8x8To16x16(const uchar yuv8x8[3 * 8 * 8], uchar yuv16x16[3 * 16 * 16],
     }
 }
 
-void Copy16x16To8x8(const uchar yuv16x16[3 * 16 * 16], uchar yuv8x8[3 * 8 * 8], int off_x, int off_y)
+__device__ void Copy16x16To8x8(const uchar yuv16x16[3 * 16 * 16], uchar yuv8x8[3 * 8 * 8], int off_x, int off_y)
 {
     for (int y = 0; y < 8; y++)
     {
@@ -2851,7 +2852,7 @@ void Copy16x16To8x8(const uchar yuv16x16[3 * 16 * 16], uchar yuv8x8[3 * 8 * 8],
     }
 }
 
-void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float g[8 * 8], float b[8 * 8], int off_x, int off_y)
+__device__ void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float g[8 * 8], float b[8 * 8], int off_x, int off_y)
 {
     for (int y = 0; y < 8; y++)
     {
@@ -2866,7 +2867,7 @@ void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 * 8], float
     }
 }
 
-void Convolution(size_t xsize, size_t ysize,
+__device__ void Convolution(size_t xsize, size_t ysize,
                  int xstep, int len, int offset,
                  const float* multipliers,
                  const float* inp,
@@ -2900,7 +2901,7 @@ void Convolution(size_t xsize, size_t ysize,
 
 // ian todo
 // �����������output
-void BlurEx(const float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output)
+__device__ void BlurEx(const float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output)
 {
     // �ο�clBlurEx2��ʵ�֣�sigma = 1.1����ʱstep��diff�����ػ�Ϊ�̶�ֵ
 	const double sigma = 1.1;
@@ -2925,7 +2926,7 @@ void BlurEx(const float *r, int xsize, int ysize, double kSigma, double border_r
 }
 
 // ian todo
-void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private float *b,
+__device__ void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private float *b,
                             __private const float *r_blurred, __private const float *g_blurred, __private const float *b_blurred,
                             int size)
 {
@@ -2955,7 +2956,7 @@ void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private f
 }
 
 // chrisk todo
-void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b,
+__device__ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b,
     float *xyb1_x, float *xyb1_y, float *xyb1_b,
     const float *c0_x, const float *c0_y, const float *c0_b,
     const float *c1_x, const float *c1_y, const float *c1_b,
@@ -3014,7 +3015,7 @@ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b,
     }
 }
 
-void floatcopy(float *dst, const float *src, int size)
+__device__ void floatcopy(float *dst, const float *src, int size)
 {
     for (int i = 0; i < size; i++)
     {
@@ -3022,7 +3023,7 @@ void floatcopy(float *dst, const float *src, int size)
     }
 }
 
-void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size)
+__device__ void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size)
 {
     for (int i = 0; i < size; i++)
     {
@@ -3030,7 +3031,7 @@ void coeffcopy_g(coeff_t *dst, __global const coeff_t *src, int size)
     }
 }
 
-void coeffcopy(coeff_t *dst, const coeff_t *src, int size)
+__device__ void coeffcopy(coeff_t *dst, const coeff_t *src, int size)
 {
     for (int i = 0; i < size; i++)
     {
@@ -3038,7 +3039,7 @@ void coeffcopy(coeff_t *dst, const coeff_t *src, int size)
     }
 }
 
-void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize])
+__device__ void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize])
 {
     float rgb_blurred[3][kDCTBlockSize];
     for (int i = 0; i < 3; i++)
@@ -3048,7 +3049,7 @@ void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize])
     OpsinDynamicsImageBlock(rgb[0], rgb[1], rgb[2], rgb_blurred[0], rgb_blurred[1], rgb_blurred[2], kDCTBlockSize);
 }
 
-double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block)
+__device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block)
 {
 //    CalcOpsinDynamicsImage(rgb0_c);
     CalcOpsinDynamicsImage(rgb1_c);
@@ -3093,7 +3094,7 @@ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private
 }
 
 // return the count of Non-zero item
-int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order)
+__device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order)
 {
     const int block_size = 64;
     int size = 0;
@@ -3110,7 +3111,7 @@ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8]
     return SortInputOrder(input_order->pData, size);
 }
 
-int GetOrigBlock(float rgb0_c[3][kDCTBlockSize],
+__device__ int GetOrigBlock(float rgb0_c[3][kDCTBlockSize],
                  const __global float *orig_image_batch,
                  int width_, int height_,
                  int block_x, int block_y,
@@ -3135,7 +3136,7 @@ int GetOrigBlock(float rgb0_c[3][kDCTBlockSize],
     return block_ix;
 }
 
-double CompareBlockFactor(const channel_info mayout_channel[3],
+__device__ double CompareBlockFactor(const channel_info mayout_channel[3],
                           const coeff_t* candidate_block,
                           const int block_x,
                           const int block_y,
diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h
index cf4d9212..8287e341 100644
--- a/clguetzli/clguetzli.cl.h
+++ b/clguetzli/clguetzli.cl.h
@@ -7,6 +7,9 @@
     #define __private
     #define __global
     #define __constant
+    #define __constant_ex
+    #define __device__
+
     typedef unsigned char uchar;
     typedef unsigned short ushort;
 
@@ -50,10 +53,12 @@
             };
         }ocl_channels;
     #endif
-#endif
+#endif /*__CUDACC__*/
 #endif /*__cplusplus*/
 
 #ifdef __OPENCL_VERSION__
+    #define __constant_ex __constant
+    #define __device__
     typedef union ocl_channels_t
     {
         struct
@@ -68,12 +73,22 @@
             float *ch[3];
         };
     }ocl_channels;
-
 #endif /*__OPENCL_VERSION__*/
 
 #ifdef __CUDACC__
+    #define __kernel    extern "C" __global__
+    #define __private
     #define __global
+    #define __constant  __constant__
+    #define __constant_ex
+    typedef unsigned char uchar;
     typedef unsigned short ushort;
+
+    __device__ int get_global_id(int dim);
+    __device__ int get_global_size(int dim);
+    void set_global_id(int dim, int id);
+    void set_global_size(int dim, int size);
+
 #endif /*__CUDACC__*/
 
     typedef short coeff_t;
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 0606793b..47cf78d1 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -1235,8 +1235,8 @@ void cuScaleImage(float *img, size_t length, double scale)
 	void *args[2] = { &m, &scale};
 
 	CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE],
-        length, 1, 1,
-                   1, 1, 1,
+        1, 1, 1,
+                   length, 1, 1,
                    0,
                    ocu.stream, args, NULL);
 
diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu
index 17b65143..d8591f5f 100644
--- a/clguetzli/clguetzli.cu
+++ b/clguetzli/clguetzli.cu
@@ -1,7 +1,5 @@
-#include "clguetzli\clguetzli.cl.h"
+#include "clguetzli\clguetzli.cl"
 
-#ifdef __CUDACC__
-//#ifdef __OPENCL_VERSION__
 __device__ int get_global_id(int dim)
 {
     switch (dim)
@@ -16,11 +14,8 @@ __device__ int get_global_id(int dim)
         return threadIdx.x;
     }
 }
-#endif
 
-
-extern "C" __global__ void clScaleImageEx(float * img, double scale)
+__device__ int get_global_size(int dim)
 {
-    const int i = get_global_id(0);
-    img[i] = 0.0001;
+    return 0;
 }
diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index 7846afcb..32578c36 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -48,6 +48,7 @@ ocu_args_d_t& getOcu(void)
     char *ptx = new char[ptxSize];
     nvrtcGetPTX(prog, ptx);
 
+    nvrtcDestroyProgram(&prog);
     LogError("BuildInfo:\r\n%s\r\n", log);
 
     CUmodule mod;
@@ -61,7 +62,25 @@ ocu_args_d_t& getOcu(void)
     delete[] log;
     delete[] ptx;
 
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTION], mod, "clConvolutionEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONX], mod, "clConvolutionXEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONY], mod, "clConvolutionYEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_SQUARESAMPLE], mod, "clSquareSampleEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE], mod, "clOpsinDynamicsImageEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_MASKHIGHINTENSITYCHANGE], mod, "clMaskHighIntensityChangeEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTOR], mod, "clEdgeDetectorMapEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_BLOCKDIFFMAP], mod, "clBlockDiffMapEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTORLOWFREQ], mod, "clEdgeDetectorLowFreqEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_DIFFPRECOMPUTE], mod, "clDiffPrecomputeEx");
     r = cuModuleGetFunction(&ocu.kernel[KERNEL_SCALEIMAGE], mod, "clScaleImageEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_AVERAGE5X5], mod, "clAverage5x5Ex");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_MINSQUAREVAL], mod, "clMinSquareValEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_DOMASK], mod, "clDoMaskEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_COMBINECHANNELS], mod, "clCombineChannelsEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_UPSAMPLESQUAREROOT], mod, "clUpsampleSquareRootEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_REMOVEBORDER], mod, "clRemoveBorderEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_ADDBORDER], mod, "clAddBorderEx");
+    r = cuModuleGetFunction(&ocu.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER], mod, "clComputeBlockZeroingOrderEx");
 
     cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED);
     cuCtxSetSharedMemConfig(CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE);
diff --git a/compile.bat b/compile.bat
index 05cdd472..8aa9430f 100644
--- a/compile.bat
+++ b/compile.bat
@@ -156,5 +156,4 @@ echo     %0 store 10.0.10240.0
 
 :end
 
-
-nvcc -Xcompiler "/wd 4819" -I"./" -arch=sm_30 -ptx -o clguetzli\clguetzli.cu.ptx30 clguetzli\clguetzli.cu
\ No newline at end of file
+nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --fmad=false --machine 64 -G -g -ptx -o clguetzli\clguetzli.cu.ptx64 clguetzli\clguetzli.cu
\ No newline at end of file
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index e31abaff..a9154d35 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -158,6 +158,9 @@
     <PostBuildEvent>
       <Command>compile.bat</Command>
     </PostBuildEvent>
+    <CustomBuild>
+      <Message>CUDA CU</Message>
+    </CustomBuild>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <ClCompile>
@@ -392,12 +395,19 @@
       <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
       </Command>
     </Intel_OpenCL_Build_Rules>
-    <None Include="clguetzli\clguetzli.cu">
+    <CustomBuild Include="clguetzli\clguetzli.cu">
       <FileType>Document</FileType>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
       <Message Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CUDA Code Builder</Message>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)clguetzli\compile.bat</Command>
-    </None>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)compile.bat</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compile.bat</Command>
+      <LinkObjects Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkObjects>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cu.ptx</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ProjectDir)compile.bat</Command>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CUDA Code Builder</Message>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cu.ptx</Outputs>
+      <LinkObjects Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</LinkObjects>
+    </CustomBuild>
     <None Include="third_party\libpng\pngwin.def" />
     <None Include="third_party\zlib\inffas32.asm" />
     <None Include="third_party\zlib\match32.asm" />
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index bfdedbe0..17b8edf4 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -604,13 +604,15 @@
     <None Include="third_party\zlib\zlib.def">
       <Filter>third_party\zlib</Filter>
     </None>
-    <None Include="clguetzli\clguetzli.cu">
-      <Filter>clguetzli</Filter>
-    </None>
   </ItemGroup>
   <ItemGroup>
     <Intel_OpenCL_Build_Rules Include="clguetzli\clguetzli.cl">
       <Filter>clguetzli</Filter>
     </Intel_OpenCL_Build_Rules>
   </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="clguetzli\clguetzli.cu">
+      <Filter>clguetzli</Filter>
+    </CustomBuild>
+  </ItemGroup>
 </Project>
\ No newline at end of file

From 4533a020130709fb5690637779f2f28e170f8b97 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 2 Jun 2017 12:24:52 +0800
Subject: [PATCH 119/189] =?UTF-8?q?cuScaleImage=E8=B7=91=E9=80=9A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl.h | 23 +++++++++++++++++++----
 clguetzli/clguetzli.cpp  | 26 ++++++++++++++++----------
 clguetzli/clguetzli.cu   |  3 ++-
 clguetzli/ocu.cpp        | 32 ++++++++++++++++++--------------
 guetzli.vcxproj          |  5 ++++-
 5 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h
index 8287e341..aeabed49 100644
--- a/clguetzli/clguetzli.cl.h
+++ b/clguetzli/clguetzli.cl.h
@@ -84,10 +84,25 @@
     typedef unsigned char uchar;
     typedef unsigned short ushort;
 
-    __device__ int get_global_id(int dim);
-    __device__ int get_global_size(int dim);
-    void set_global_id(int dim, int id);
-    void set_global_size(int dim, int size);
+    __device__ int get_global_id(int dim)  // OpenCL get_global_id() shim: global index of the calling thread along `dim` (0=x, 1=y, else z)
+    {
+        switch (dim)  // block offset plus thread offset, so the result is right for any grid/block launch shape
+        {
+        case 0:  return blockIdx.x * blockDim.x + threadIdx.x;
+        case 1:  return blockIdx.y * blockDim.y + threadIdx.y;
+        default: return blockIdx.z * blockDim.z + threadIdx.z;
+        }
+    }
+
+    __device__ int get_global_size(int dim)  // OpenCL get_global_size() shim: total number of threads launched along `dim` (0=x, 1=y, else z)
+    {
+        switch(dim)  // blocks in the grid times threads per block; equals gridDim when blocks hold a single thread
+        {
+        case 0: return gridDim.x * blockDim.x;
+        case 1: return gridDim.y * blockDim.y;
+        default: return gridDim.z * blockDim.z;
+        }
+    }
 
 #endif /*__CUDACC__*/
 
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 47cf78d1..09158f55 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -339,7 +339,7 @@ void clConvolutionEx(
 	}
 }
 
-void clConvolutionX(
+void clConvolutionXEx(
     cl_mem result/*out*/,
     const cl_mem inp, size_t xsize, size_t ysize,
 	const cl_mem multipliers, size_t len,
@@ -375,7 +375,7 @@ void clConvolutionX(
 	}
 }
 
-void clConvolutionY(
+void clConvolutionYEx(
     cl_mem result/*out*/,
     const cl_mem inp, size_t xsize, size_t ysize,
 	const cl_mem multipliers, size_t len,
@@ -511,15 +511,15 @@ void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
 	if (xstep > 1)
 	{
 		ocl.allocA(sizeof(cl_float) * xsize * ysize);
-		clConvolutionX(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-		clConvolutionY(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+		clConvolutionXEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+		clConvolutionYEx(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
         clSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep);
 	}
 	else
 	{
 		ocl.allocA(sizeof(cl_float) * xsize * ysize);
-		clConvolutionX(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-		clConvolutionY(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+		clConvolutionXEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+		clConvolutionYEx(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
 	}
 
 	clReleaseMemObject(mem_expn);
@@ -1227,6 +1227,7 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si
 	clReleaseMemObject(blurred);
 }
 
+//////////////////////////////////////////////////////////////////////////////////////
 void cuScaleImage(float *img, size_t length, double scale)
 {
 	ocu_args_d_t &ocu = getOcu();
@@ -1235,10 +1236,10 @@ void cuScaleImage(float *img, size_t length, double scale)
 	void *args[2] = { &m, &scale};
 
 	CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE],
-        1, 1, 1,
-                   length, 1, 1,
-                   0,
-                   ocu.stream, args, NULL);
+                    length, 1, 1,
+                    1, 1, 1,
+                    0,
+                    ocu.stream, args, NULL);
 
     r = cuStreamSynchronize(ocu.stream);
 
@@ -1246,4 +1247,9 @@ void cuScaleImage(float *img, size_t length, double scale)
 
 	cuMemFree(m);
 	return;
+}
+
+void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize)
+{
+    // Empty stub: the body contains no statements, so calling this is currently a no-op.
 }
\ No newline at end of file
diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu
index d8591f5f..dbca9906 100644
--- a/clguetzli/clguetzli.cu
+++ b/clguetzli/clguetzli.cu
@@ -1,5 +1,5 @@
 #include "clguetzli\clguetzli.cl"
-
+/*
 __device__ int get_global_id(int dim)
 {
     switch (dim)
@@ -19,3 +19,4 @@ __device__ int get_global_size(int dim)
 {
     return 0;
 }
+*/
diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index 32578c36..40b37225 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -9,6 +9,8 @@ ocu_args_d_t& getOcu(void)
 
     if (bInit == true) return ocu;
 
+    bInit = true;
+
     CUresult r = cuInit(0);
     CUdevice dev = 0;
     CUcontext ctxt;
@@ -29,18 +31,24 @@ ocu_args_d_t& getOcu(void)
 
     char* source = nullptr;
     size_t src_size = 0;
-    ReadSourceFromFile("clguetzli/clguetzli.cu", &source, &src_size);
+    ReadSourceFromFile("clguetzli/clguetzli.cl", &source, &src_size);
 
     nvrtcProgram prog;
-    const char *opts[] = { "-arch=compute_30", "--fmad=false" };
-    nvrtcCreateProgram(&prog, source, "clguetzli.cu", 0, NULL, NULL);
-    nvrtcCompileProgram(prog, 2, opts);
+    const char *opts[] = { "-arch=compute_30", "-default-device", "-G", "-I\"./\"", "--fmad=false" };
+    nvrtcCreateProgram(&prog, source, "clguetzli.cl", 0, NULL, NULL);
+    nvrtcResult compile_result = nvrtcCompileProgram(prog, 3, opts); // NOTE(review): the option count is 3 but opts holds 5 entries, so the last two (the -I include path and --fmad=false) are not passed to NVRTC; confirm this is intended
+    if (NVRTC_SUCCESS != compile_result)
+    {
+        // Obtain compilation log from the program.
+        size_t logSize = 0;
+        nvrtcGetProgramLogSize(prog, &logSize);
+        char *log = new char[logSize];
+        nvrtcGetProgramLog(prog, log);
 
-    // Obtain compilation log from the program.
-    size_t logSize = 0;
-    nvrtcGetProgramLogSize(prog, &logSize);
-    char *log = new char[logSize];
-    nvrtcGetProgramLog(prog, log);
+        LogError("BuildInfo:\r\n%s\r\n", log);
+
+        delete[] log;
+    }
 
     // Obtain PTX from the program.
     size_t ptxSize = 0;
@@ -48,9 +56,6 @@ ocu_args_d_t& getOcu(void)
     char *ptx = new char[ptxSize];
     nvrtcGetPTX(prog, ptx);
 
-    nvrtcDestroyProgram(&prog);
-    LogError("BuildInfo:\r\n%s\r\n", log);
-
     CUmodule mod;
     CUjit_option jit_options[2];
     void *jit_optvals[2];
@@ -59,7 +64,6 @@ ocu_args_d_t& getOcu(void)
     r = cuModuleLoadDataEx(&mod, ptx, 1, jit_options, jit_optvals);
 
     delete[] source;
-    delete[] log;
     delete[] ptx;
 
     r = cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTION], mod, "clConvolutionEx");
@@ -104,7 +108,7 @@ ocu_args_d_t::~ocu_args_d_t()
 {
     cuModuleUnload(mod);
     cuCtxDestroy(ctxt);
-    cuStreamDestroy(stream);
+//    cuStreamDestroy(stream);
 }
 
 CUdeviceptr ocu_args_d_t::allocMem(size_t s, const void *init)
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index a9154d35..f2711884 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -394,10 +394,12 @@
       <FileType>Document</FileType>
       <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
       </Command>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
     </Intel_OpenCL_Build_Rules>
     <CustomBuild Include="clguetzli\clguetzli.cu">
       <FileType>Document</FileType>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
       <Message Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CUDA Code Builder</Message>
       <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)compile.bat</Command>
       <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compile.bat</Command>
@@ -407,6 +409,7 @@
       <Message Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CUDA Code Builder</Message>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cu.ptx</Outputs>
       <LinkObjects Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</LinkObjects>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
     </CustomBuild>
     <None Include="third_party\libpng\pngwin.def" />
     <None Include="third_party\zlib\inffas32.asm" />

From 6240acea8875a451b168e8b0962528a0ce330b0a Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 2 Jun 2017 14:47:39 +0800
Subject: [PATCH 120/189] =?UTF-8?q?cuOpsinDynamicsImage=20=E5=AE=8C?=
 =?UTF-8?q?=E6=88=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clbutter_comparator.cpp |   8 ++
 clguetzli/clguetzli.cl.h          |  43 +++++++++
 clguetzli/clguetzli.cpp           | 144 +++++++++++++++++++++++++++++-
 clguetzli/clguetzli.h             |   9 ++
 clguetzli/ocl.cpp                 |   3 +-
 clguetzli/ocl.h                   |   2 +-
 clguetzli/ocu.cpp                 |  24 ++++-
 clguetzli/ocu.h                   |   4 +-
 8 files changed, 229 insertions(+), 8 deletions(-)

diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index fd31632d..d3c1fdce 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -289,6 +289,14 @@ namespace butteraugli
 
             clOpsinDynamicsImage(r, g, b, xsize, ysize);
         }
+        else if (g_useCuda && xsize > 100 && ysize > 100)
+        {
+            float * r = rgb[0].data();
+            float * g = rgb[1].data();
+            float * b = rgb[2].data();
+
+            cuOpsinDynamicsImage(r, g, b, xsize, ysize);
+        }
         else
         {
             std::vector< std::vector<float>> orig_rgb;
diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h
index aeabed49..4e461399 100644
--- a/clguetzli/clguetzli.cl.h
+++ b/clguetzli/clguetzli.cl.h
@@ -1,6 +1,13 @@
 #ifndef __CLGUETZLI_CL_H__
 #define __CLGUETZLI_CL_H__
 
+#ifdef __cplusplus  // host C++ build only: bring in the OpenCL and CUDA driver API types
+#ifndef __CUDACC__
+#include "CL/cl.h"  // forward slash: a backslash inside a header name is not portable
+#include "cuda.h"
+#endif /*__CUDACC__*/
+#endif /*__cplusplus*/
+
 #ifdef __cplusplus
 #ifndef __CUDACC__
     #define __kernel
@@ -32,6 +39,20 @@
                 float *ch[3];
             };
         }ocl_channels;
+
+        typedef union ocu_channels_t  // three per-channel float pointers, addressable by name or index
+        {
+            struct
+            {
+                float * r;
+                float * g;
+                float * b;
+            };
+            union
+            {
+                float *ch[3];  // shares storage with the r/g/b members above
+            };
+        }ocu_channels;
     #else
         typedef union ocl_channels_t
         {
@@ -52,6 +73,26 @@
                 cl_mem ch[3];
             };
         }ocl_channels;
+
+        typedef union ocu_channels_t  // three CUDA device pointers, addressable by name or index
+        {
+            struct  // first set of names for the three handles
+            {
+                CUdeviceptr r;
+                CUdeviceptr g;
+                CUdeviceptr b;
+            };
+            struct  // second set of names sharing the same storage
+            {
+                CUdeviceptr x;
+                CUdeviceptr y;
+                CUdeviceptr b_;
+            };
+            union
+            {
+                CUdeviceptr ch[3];  // indexed view used by the alloc/release helpers
+            };
+        }ocu_channels;
     #endif
 #endif /*__CUDACC__*/
 #endif /*__cplusplus*/
@@ -59,6 +100,7 @@
 #ifdef __OPENCL_VERSION__
     #define __constant_ex __constant
     #define __device__
+/*
     typedef union ocl_channels_t
     {
         struct
@@ -73,6 +115,7 @@
             float *ch[3];
         };
     }ocl_channels;
+*/
 #endif /*__OPENCL_VERSION__*/
 
 #ifdef __CUDACC__
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 09158f55..02d4ed71 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -71,7 +71,7 @@ ocl_args_d_t& getOcl(void)
 
 void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize)
 {
-    cl_int channel_size = xsize * ysize * sizeof(float);
+    size_t channel_size = xsize * ysize * sizeof(float);
 
     cl_int err = 0;
     ocl_args_d_t &ocl = getOcl();
@@ -379,8 +379,7 @@ void clConvolutionYEx(
     cl_mem result/*out*/,
     const cl_mem inp, size_t xsize, size_t ysize,
 	const cl_mem multipliers, size_t len,
-	int xstep, int offset, double border_ratio
-	)
+	int xstep, int offset, double border_ratio)
 {
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -1233,7 +1232,7 @@ void cuScaleImage(float *img, size_t length, double scale)
 	ocu_args_d_t &ocu = getOcu();
 	CUdeviceptr m = ocu.allocMem(length * sizeof(float), img);
 
-	void *args[2] = { &m, &scale};
+	void *args[] = { &m, &scale};
 
 	CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE],
                     length, 1, 1,
@@ -1251,5 +1250,142 @@ void cuScaleImage(float *img, size_t length, double scale)
 
 void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize)
 {
+    size_t channel_size = xsize * ysize * sizeof(float);
+    // Host entry point: r, g and b are host arrays of xsize*ysize floats, overwritten in place.
+    CUresult err = CUDA_SUCCESS;
+    ocu_args_d_t &ocu = getOcu();
+    ocu_channels rgb = ocu.allocMemChannels(channel_size, r, g, b);
+    // Run the device-side transform on the three device channels.
+    cuOpsinDynamicsImageEx(rgb, xsize, ysize);
+    // Copy each processed channel back over the caller's host buffer.
+    cuMemcpyDtoH(r, rgb.r, channel_size);
+    cuMemcpyDtoH(g, rgb.g, channel_size);
+    cuMemcpyDtoH(b, rgb.b, channel_size);
+    // Free the three device channels and clear their handles.
+    ocu.releaseMemChannels(rgb);
+}
+
+void cuConvolutionXEx(
+    CUdeviceptr result/*out*/,
+    const CUdeviceptr inp, size_t xsize, size_t ysize,
+    const CUdeviceptr multipliers, size_t len,
+    int xstep, int offset, double border_ratio)
+{
+    // Launches the KERNEL_CONVOLUTIONX kernel (expected to be clConvolutionXEx) with one
+    // single-thread block per pixel of the xsize*ysize image, then waits on ocu.stream.
+    ocu_args_d_t &ocu = getOcu();
+    // The kernel declares `int len` and `float border_ratio`; cuLaunchKernel reads each argument
+    // with the kernel's own size, so pass exactly-typed copies rather than size_t/double storage.
+    const int klen = static_cast<int>(len);
+    const float kborder_ratio = static_cast<float>(border_ratio);
+    const void *args[] = { &result, &inp, &multipliers, &klen, &xstep, &offset, &kborder_ratio };
+    CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX],
+        xsize, ysize, 1, 1, 1, 1, 0,
+        ocu.stream, (void**)args, NULL);
+    err = cuStreamSynchronize(ocu.stream);
+}
+
+void cuConvolutionYEx(
+    CUdeviceptr result/*out*/,
+    const CUdeviceptr inp, size_t xsize, size_t ysize,
+    const CUdeviceptr multipliers, size_t len,
+    int xstep, int offset, double border_ratio)
+{
+    // Launches the KERNEL_CONVOLUTIONY kernel (expected to be clConvolutionYEx) with one
+    // single-thread block per pixel of the xsize*ysize image, then waits on ocu.stream.
+    ocu_args_d_t &ocu = getOcu();
+    // The kernel declares `int len` and `float border_ratio`; cuLaunchKernel reads each argument
+    // with the kernel's own size, so pass exactly-typed copies rather than size_t/double storage.
+    const int klen = static_cast<int>(len);
+    const float kborder_ratio = static_cast<float>(border_ratio);
+    const void *args[] = { &result, &inp, &multipliers, &klen, &xstep, &offset, &kborder_ratio };
+    CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONY],
+        xsize, ysize, 1, 1, 1, 1, 0,
+        ocu.stream, (void**)args, NULL);
+    err = cuStreamSynchronize(ocu.stream);
+}
+
+void cuSquareSampleEx(
+    CUdeviceptr result/*out*/,
+    const CUdeviceptr image, size_t xsize, size_t ysize,
+    size_t xstep, size_t ystep)
+{
+    // Launches the KERNEL_SQUARESAMPLE kernel (expected to be clSquareSampleEx) with one
+    // single-thread block per pixel of the xsize*ysize image, then waits on ocu.stream.
+    ocu_args_d_t &ocu = getOcu();
+    // The kernel declares `int xstep, int ystep`; cuLaunchKernel reads each argument with the
+    // kernel's own size, so hand it real ints instead of the first bytes of a size_t.
+    const int kxstep = static_cast<int>(xstep);
+    const int kystep = static_cast<int>(ystep);
+    const void *args[] = { &result, &image, &kxstep, &kystep };
+    CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE],
+        xsize, ysize, 1, 1, 1, 1, 0,
+        ocu.stream, (void**)args, NULL);
+    err = cuStreamSynchronize(ocu.stream);
+}
+
+void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize,
+    const double sigma, const double border_ratio,
+    CUdeviceptr result/*out, opt*/)
+{
+    // Separable Gaussian blur of a device image: an X convolution into a scratch buffer,
+    // then a Y convolution into `result` (or back into `image` when `result` is 0).
+    // For sigma >= 6 the convolutions use a stride > 1 and a square-sample pass follows.
+    const double kRadiusFactor = 2.25;  // Accuracy increases when this factor is increased.
+    const double exp_scale = -1.0 / (2 * sigma * sigma);
+    // Filter half-width in samples, never less than 1.
+    const int radius = std::max<int>(1, kRadiusFactor * fabs(sigma));
+    const int tap_count = 2 * radius + 1;
+
+    // Unnormalised Gaussian weights for offsets -radius..radius.
+    std::vector<float> weights(tap_count);
+    for (int tap = 0; tap < tap_count; ++tap) {
+        const int d = tap - radius;
+        weights[tap] = static_cast<float>(exp(exp_scale * d * d));
+    }
+
+    const int stride = std::max<int>(1, int(sigma / 3));
+    const CUdeviceptr dst = result ? result : image;
+
+    ocu_args_d_t &ocu = getOcu();
+    CUdeviceptr d_weights = ocu.allocMem(sizeof(cl_float) * tap_count, weights.data());
+    CUdeviceptr d_scratch = ocu.allocMem(sizeof(cl_float) * xsize * ysize);
+
+    cuConvolutionXEx(d_scratch, image, xsize, ysize, d_weights, tap_count, stride, radius, border_ratio);
+    cuConvolutionYEx(dst, d_scratch, xsize, ysize, d_weights, tap_count, stride, radius, border_ratio);
+    if (stride > 1)
+    {
+        // Strided convolutions are followed by the square-sample kernel, applied in place.
+        cuSquareSampleEx(dst, dst, xsize, ysize, stride, stride);
+    }
+
+    cuMemFree(d_scratch);
+    cuMemFree(d_weights);
+}
+
+void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize)
+{
+    // Blurs each channel of `rgb` into a scratch channel set, then launches the
+    // KERNEL_OPSINDYNAMICSIMAGE kernel on the original and blurred channels.
+    static const double kSigma = 1.1;
+    const size_t pixel_count = xsize * ysize;
+
+    ocu_args_d_t &ocu = getOcu();
+    ocu_channels blurred = ocu.allocMemChannels(pixel_count * sizeof(float));
+
+    // Each blur uses border_ratio 0.0 and writes to the scratch channel,
+    // so the input channels are only read at this stage.
+    cuBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, blurred.r);
+    cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, blurred.g);
+    cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, blurred.b);
+
+    // One single-thread block per pixel; arguments are the six device pointers.
+    void *kernel_args[] = { &rgb.r, &rgb.g, &rgb.b, &blurred.r, &blurred.g, &blurred.b };
+    CUresult status = cuLaunchKernel(ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE],
+        pixel_count, 1, 1,
+        1, 1, 1,
+        0, ocu.stream, kernel_args, NULL);
+    status = cuStreamSynchronize(ocu.stream);
 
+    ocu.releaseMemChannels(blurred);
 }
\ No newline at end of file
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 760677fd..40717d4f 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -135,8 +135,17 @@ void clAddBorderEx(cl_mem out, const size_t xsize, const size_t ysize, const int
 
 void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step);
 
+////////////////////////////////////////////////////////////////
+void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize);
+
 void cuScaleImage(float *img, size_t length, double scale);
 
+void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize,
+    const double sigma, const double border_ratio,
+    CUdeviceptr result = NULL/*out, opt*/);
+
+void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize);
+
 class guetzli::OutputImage;
 
 namespace guetzli {
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index 73a8d022..aecd900e 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -250,11 +250,12 @@ ocl_channels ocl_args_d_t::allocMemChannels(size_t s, const void *c0, const void
 	return img;
 }
 
-void ocl_args_d_t::releaseMemChannels(ocl_channels rgb)
+void ocl_args_d_t::releaseMemChannels(ocl_channels &rgb)
 {
     for (int i = 0; i < 3; i++)
     {
         clReleaseMemObject(rgb.ch[i]);
+        rgb.ch[i] = NULL;
     }
 }
 
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 04407f5c..37679770 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -78,7 +78,7 @@ struct ocl_args_d_t
 
 	cl_mem allocMem(size_t s, const void *init = NULL);
 	ocl_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL);
-    void releaseMemChannels(ocl_channels rgb);
+    void releaseMemChannels(ocl_channels &rgb);
 
 	// Regular OpenCL objects:
 	cl_context       context;           // hold the context handler
diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index 40b37225..3a263c3d 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -125,4 +125,26 @@ CUdeviceptr ocu_args_d_t::allocMem(size_t s, const void *init)
     }
 
     return mem;
-}
\ No newline at end of file
+}
+
+ocu_channels ocu_args_d_t::allocMemChannels(size_t s, const void *c0, const void *c1, const void *c2)
+{
+    // Makes three allocMem calls of `s` bytes each, handing c0, c1 and c2 to them
+    // as the per-channel init pointers (in that order).
+    ocu_channels channels;
+    channels.ch[0] = allocMem(s, c0);
+    channels.ch[1] = allocMem(s, c1);
+    channels.ch[2] = allocMem(s, c2);
+
+    // All three handles are returned by value inside the union.
+    return channels;
+}
+
+void ocu_args_d_t::releaseMemChannels(ocu_channels &rgb)
+{
+    // Free every channel's device allocation, then zero the caller's handle.
+    for (CUdeviceptr *handle = rgb.ch; handle != rgb.ch + 3; ++handle) {
+        cuMemFree(*handle);
+        *handle = NULL;
+    }
+}
diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h
index 0ab97945..63a4bb47 100644
--- a/clguetzli/ocu.h
+++ b/clguetzli/ocu.h
@@ -12,7 +12,9 @@ struct ocu_args_d_t
     ocu_args_d_t();
     ~ocu_args_d_t();
 
-    CUdeviceptr allocMem(size_t s, const void *init);
+    CUdeviceptr allocMem(size_t s, const void *init = NULL);
+    ocu_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL);
+    void releaseMemChannels(ocu_channels &rgb);
 
     CUfunction  kernel[KERNEL_COUNT];
     CUstream    stream;

From 49d74ab07e3d21b99803529cdff1daf09eecd177 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 2 Jun 2017 14:58:49 +0800
Subject: [PATCH 121/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=89=A9=E4=BD=99?=
 =?UTF-8?q?=E7=9A=84cu=E5=85=A5=E5=8F=A3=E5=87=BD=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clbutter_comparator.cpp |  9 +----
 clguetzli/clguetzli.cpp           | 57 +++++++++++++++++++------------
 clguetzli/clguetzli.h             | 26 +++++++++++++-
 3 files changed, 61 insertions(+), 31 deletions(-)

diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index d3c1fdce..fa0e3920 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -229,14 +229,7 @@ namespace butteraugli
             result_org = *result;
         }
 
-        if (g_useCuda)
-        {
-            cuScaleImage(result->data(), result->size(), scale);
-        }
-        else
-        {
-            _ScaleImage(scale, result);
-        }
+        _ScaleImage(scale, result);
 
         if (g_checkOpenCL && result->size() > 64)
         {
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 02d4ed71..00449722 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -104,7 +104,6 @@ void clDiffmapOpsinDynamicsImage(
     size_t xsize, size_t ysize,
     size_t step)
 {
-
     const size_t res_xsize = (xsize + step - 1) / step;
     const size_t res_ysize = (ysize + step - 1) / step;
 
@@ -1227,27 +1226,6 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si
 }
 
 //////////////////////////////////////////////////////////////////////////////////////
-void cuScaleImage(float *img, size_t length, double scale)
-{
-	ocu_args_d_t &ocu = getOcu();
-	CUdeviceptr m = ocu.allocMem(length * sizeof(float), img);
-
-	void *args[] = { &m, &scale};
-
-	CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_SCALEIMAGE],
-                    length, 1, 1,
-                    1, 1, 1,
-                    0,
-                    ocu.stream, args, NULL);
-
-    r = cuStreamSynchronize(ocu.stream);
-
-    r = cuMemcpyDtoH(img, m, length * sizeof(float));
-
-	cuMemFree(m);
-	return;
-}
-
 void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize)
 {
     size_t channel_size = xsize * ysize * sizeof(float);
@@ -1265,6 +1243,41 @@ void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, cons
     ocu.releaseMemChannels(rgb);
 }
 
+void cuDiffmapOpsinDynamicsImage(
+    float* result,
+    const float* r, const float* g, const float* b,
+    const float* r2, const float* g2, const float* b2,
+    const size_t xsize, const size_t ysize,
+    const size_t step)
+{
+
+}
+
+void cuComputeBlockZeroingOrder(
+    guetzli::CoeffData *output_order_batch,
+    const channel_info orig_channel[3],
+    const float *orig_image_batch,
+    const float *mask_scale,
+    const int image_width,
+    const int image_height,
+    const channel_info mayout_channel[3],
+    const int factor,
+    const int comp_mask,
+    const float BlockErrorLimit)
+{
+
+}
+
+void cuMask(
+    float* mask_r, float* mask_g, float* mask_b,
+    float* maskdc_r, float* maskdc_g, float* maskdc_b,
+    const size_t xsize, const size_t ysize,
+    const float* r, const float* g, const float* b,
+    const float* r2, const float* g2, const float* b2)
+{
+
+}
+
 void cuConvolutionXEx(
     CUdeviceptr result/*out*/,
     const CUdeviceptr inp, size_t xsize, size_t ysize,
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 40717d4f..61743e3d 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -138,7 +138,31 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si
 ////////////////////////////////////////////////////////////////
 void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize);
 
-void cuScaleImage(float *img, size_t length, double scale);
+void cuDiffmapOpsinDynamicsImage(
+    float* result,
+    const float* r, const float* g, const float* b,
+    const float* r2, const float* g2, const float* b2,
+    const size_t xsize, const size_t ysize,
+    const size_t step);
+
+void cuComputeBlockZeroingOrder(
+    guetzli::CoeffData *output_order_batch,
+    const channel_info orig_channel[3],
+    const float *orig_image_batch,
+    const float *mask_scale,
+    const int image_width,
+    const int image_height,
+    const channel_info mayout_channel[3],
+    const int factor,
+    const int comp_mask,
+    const float BlockErrorLimit);
+
+void cuMask(
+    float* mask_r, float* mask_g, float* mask_b,
+    float* maskdc_r, float* maskdc_g, float* maskdc_b,
+    const size_t xsize, const size_t ysize,
+    const float* r, const float* g, const float* b,
+    const float* r2, const float* g2, const float* b2);
 
 void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize,
     const double sigma, const double border_ratio,

From 63ac0642b687ed4cd0ef16c8a577c6c6c6582a4c Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 2 Jun 2017 15:15:00 +0800
Subject: [PATCH 122/189] =?UTF-8?q?=E7=AE=80=E5=8C=96=E7=82=B9=E4=BB=A3?=
 =?UTF-8?q?=E7=A0=81=E5=96=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 appveyor.yml                      |  32 --
 clguetzli/clbutter_comparator.cpp |   6 +-
 clguetzli/clguetzli.cl            |  74 +--
 clguetzli/clguetzli.cl.cpp        |   2 +-
 clguetzli/clguetzli.cpp           |  69 +--
 clguetzli/clguetzli.h             |  14 +-
 clguetzli/clguetzli_test.cpp      |   6 +-
 clguetzli/ocl.cpp                 |   6 +-
 clguetzli/ocl.h                   |   1 +
 clguetzli/utils.h                 |   1 +
 guetzli.vcxproj                   | 568 +++++++++---------
 guetzli.vcxproj.filters           | 928 +++++++++++++++---------------
 guetzli/butteraugli_comparator.cc |   2 +-
 guetzli/guetzli.cc                |   3 +-
 guetzli/processor.cc              |   8 +-
 guetzli/processor.h               |   2 +-
 guetzli_static.vcxproj            | 110 ++--
 guetzli_static.vcxproj.filters    | 312 +++++-----
 tests/golden_checksums.txt        |  20 +-
 19 files changed, 1054 insertions(+), 1110 deletions(-)
 delete mode 100644 appveyor.yml

diff --git a/appveyor.yml b/appveyor.yml
deleted file mode 100644
index 061ab6d0..00000000
--- a/appveyor.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-version: '1.0.1#{build}'
-
-shallow_clone: true
-
-os:
-  - Visual Studio 2015
-
-environment:
-  matrix:
-  - TOOLSET: vs2015
-
-install:
-  - ps: Start-FileDownload 'https://github.com/premake/premake-core/releases/download/v5.0.0-alpha11/premake-5.0.0-alpha11-windows.zip' 'premake.zip'
-  - 7z x premake.zip
-  - premake5.exe %TOOLSET%
-  - git clone https://github.com/Microsoft/vcpkg
-  - md vcpkg\downloads\nuget-3.5.0
-  - appveyor DownloadFile https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -FileName %appveyor_build_folder%\vcpkg\downloads\nuget-3.5.0\nuget.exe
-  - appveyor DownloadFile https://cmake.org/files/v3.8/cmake-3.8.0-rc1-win32-x86.zip -FileName %appveyor_build_folder%\vcpkg\downloads\cmake-3.8.0-rc1-win32-x86.zip
-  - 7z x %appveyor_build_folder%\vcpkg\downloads\cmake-3.8.0-rc1-win32-x86.zip
-  - cd vcpkg
-  - powershell -exec bypass -File scripts\bootstrap.ps1
-  - vcpkg integrate install
-  - vcpkg install libpng
-  - cd ..
-
-configuration:
-  - Debug
-  - Release
-
-build:
-  project: guetzli.sln
diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index fa0e3920..64c0a3dd 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -53,7 +53,7 @@ namespace butteraugli
         {
             tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
-                xsize_, ysize_, step_,
+                xsize_, ysize_, step_, 
                 (*edge_detector_map).data());
         }
     }
@@ -104,7 +104,7 @@ namespace butteraugli
         }
     }
 
-    void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values)
+    void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) 
     {
         std::vector<float> img;
         if (g_checkOpenCL && xsize > 8 && ysize > 8)
@@ -305,6 +305,6 @@ namespace butteraugli
                 tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), xsize, ysize,
                     rgb[0].data(), rgb[1].data(), rgb[2].data());
             }
-        }
+        }  
     }
 }
\ No newline at end of file
diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index ec04630d..1e026fa9 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -16,7 +16,7 @@ typedef struct __IntFloatPair
     float err;
 }IntFloatPair, DCTScoreData, CoeffData;
 
-typedef struct __IntFloatPairList
+typedef struct __IntFloatPairList 
 {
     int size;
     IntFloatPair *pData;
@@ -48,9 +48,9 @@ __device__ void Butteraugli8x8CornerEdgeDetectorDiff(
 __device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order);
 
 __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
-                        const coeff_t* candidate_block,
-                        const int block_x,
-                        const int block_y,
+                        const coeff_t* candidate_block, 
+                        const int block_x, 
+                        const int block_y, 
                         __global const float *orig_image_batch,
                         __global const float *mask_scale,
                         const int image_width,
@@ -65,7 +65,7 @@ __device__ int list_push_back(IntFloatPairList* list, int i, float f);
 
 __kernel void clConvolutionEx(
 	__global float* result,
-	__global const float* inp, const int xsize,
+	__global const float* inp, const int xsize, 
 	__global const float* multipliers, const int len,
     const int xstep, const int offset, const float border_ratio)
 {
@@ -107,7 +107,7 @@ __kernel void clConvolutionEx(
 __kernel void clConvolutionXEx(
 	__global float* result,
 	__global const float* inp,
-	__global const float* multipliers, const int len,
+	__global const float* multipliers, const int len, 
 	const int step, const int offset, const float border_ratio)
 {
     const int x = get_global_id(0);
@@ -147,8 +147,8 @@ __kernel void clConvolutionXEx(
 
 __kernel void clConvolutionYEx(
 	__global float* result,
-	__global const float* inp,
-	__global const float* multipliers, const int len,
+	__global const float* inp, 
+	__global const float* multipliers, const int len, 
     const int step, const int offset, const float border_ratio)
 {
     const int x = get_global_id(0);
@@ -189,7 +189,7 @@ __kernel void clConvolutionYEx(
 
 __kernel void clSquareSampleEx(
 	__global float* result,
-	__global const float* image,
+	__global const float* image, 
 	const int xstep, const int ystep)
 {
     const int x = get_global_id(0);
@@ -528,7 +528,7 @@ __kernel void clAverage5x5Ex(__global float *img, __global const float *img_org)
     const int y = get_global_id(1);
     const int xsize = get_global_size(0);
     const int ysize = get_global_size(1);
-
+	
     const int row0 = y * xsize;
 	if (x - 1 >= 0) {
 		img[row0 + x] += img_org[row0 + x - 1];
@@ -707,7 +707,7 @@ __kernel void clAddBorderEx(__global float *out, int s, int s2, __global const f
 
 	if (x >= xsize - s ||
 	    y >= ysize - s)
-	{
+	{ 
 		return;
 	}
 
@@ -803,8 +803,8 @@ __kernel void clComputeBlockZeroingOrderEx(
                                                block_y,
                                                orig_image_batch,
                                                mask_scale,
-                                               image_width,
-                                               image_height,
+                                               image_width, 
+                                               image_height, 
                                                factor);
             if (max_err < best_err)
             {
@@ -2868,12 +2868,12 @@ __device__ void Copy16x16ToChannel(const float rgb16x16[3][16 * 16], float r[8 *
 }
 
 __device__ void Convolution(size_t xsize, size_t ysize,
-                 int xstep, int len, int offset,
-                 const float* multipliers,
-                 const float* inp,
+                 int xstep, int len, int offset, 
+                 const float* multipliers, 
+                 const float* inp, 
                  float border_ratio,
                  float* result)
-{
+{ 
 	float weight_no_border = 0;
 
 	for (size_t j = 0; j <= 2 * offset; ++j) {
@@ -2909,17 +2909,17 @@ __device__ void BlurEx(const float *r, int xsize, int ysize, double kSigma, doub
 	const double scaler = -0.41322314049586772; // when sigma=1.1, scaler is -0.41322314049586772
 	const int diff = 2;  // when sigma=1.1, diff's value is 2.
 	const int expn_size = 5; // when sigma=1.1, scaler is  5
-	float expn[5] = { exp(scaler * (-diff) * (-diff)),
-							  exp(scaler * (-diff + 1) * (-diff + 1)),
+	float expn[5] = { exp(scaler * (-diff) * (-diff)), 
+							  exp(scaler * (-diff + 1) * (-diff + 1)), 
 							  exp(scaler * (-diff + 2) * (-diff + 2)),
 							  exp(scaler * (-diff + 3) * (-diff + 3)),
-							  exp(scaler * (-diff + 4) * (-diff + 4))};
+							  exp(scaler * (-diff + 4) * (-diff + 4))};				  
 	const int xstep = 1; // when sigma=1.1, xstep is 1.
 	const int ystep = xstep;
 
 	int dxsize = (xsize + xstep - 1) / xstep;
 
-	float tmp[8*8] = { 0 };
+	float tmp[8*8] = { 0 }; 
 	Convolution(xsize, ysize, xstep, expn_size, diff, expn, r, border_ratio, tmp);
 	Convolution(ysize, dxsize, ystep, expn_size, diff, expn, tmp,
               border_ratio, output);
@@ -3050,7 +3050,7 @@ __device__ void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize])
 }
 
 __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block)
-{
+{ 
 //    CalcOpsinDynamicsImage(rgb0_c);
     CalcOpsinDynamicsImage(rgb1_c);
 
@@ -3067,7 +3067,7 @@ __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize],
                                 8, 8);
 
     // ����ΪɶҪ��floatת��double���ܼ��������㣿
-    double b0[3 * kDCTBlockSize];       //
+    double b0[3 * kDCTBlockSize];       // 
     double b1[3 * kDCTBlockSize];
     for (int c = 0; c < 3; ++c) {
         for (int ix = 0; ix < kDCTBlockSize; ++ix) {
@@ -3107,14 +3107,14 @@ __device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_b
             }
         }
     }
-
+ 
     return SortInputOrder(input_order->pData, size);
 }
 
 __device__ int GetOrigBlock(float rgb0_c[3][kDCTBlockSize],
-                 const __global float *orig_image_batch,
+                 const __global float *orig_image_batch, 
                  int width_, int height_,
-                 int block_x, int block_y,
+                 int block_x, int block_y, 
                  int factor,
                  int off_x, int off_y)
 {
@@ -3137,9 +3137,9 @@ __device__ int GetOrigBlock(float rgb0_c[3][kDCTBlockSize],
 }
 
 __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
-                          const coeff_t* candidate_block,
-                          const int block_x,
-                          const int block_y,
+                          const coeff_t* candidate_block, 
+                          const int block_x, 
+                          const int block_y, 
                           __global const float *orig_image_batch,
                           __global const float *mask_scale,
                           const int image_width,
@@ -3183,7 +3183,7 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
                 }
             }
         }
-        else {
+        else { 
             if (factor == 1) {
                 int block_xx = block_x / mayout_channel[c].factor;
                 int block_yy = block_y / mayout_channel[c].factor;
@@ -3192,9 +3192,9 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
 
                 int block_16x16idx = block_yy * mayout_channel[c].block_width + block_xx;
                 __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_16x16idx * 8 * 8;
-
-                CoeffToYUV16x16_g(coeff_block, &yuv16x16[c],
-                    mayout_channel[c].pixel, block_xx, block_yy,
+               
+                CoeffToYUV16x16_g(coeff_block, &yuv16x16[c], 
+                    mayout_channel[c].pixel, block_xx, block_yy, 
                     image_width,
                     image_height);
 
@@ -3203,9 +3203,9 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
             }
             else {
                 const coeff_t * coeff_block = candidate_channel[c];
-                CoeffToYUV16x16(coeff_block, &yuv16x16[c],
-                    mayout_channel[c].pixel, block_x, block_y,
-                    image_width,
+                CoeffToYUV16x16(coeff_block, &yuv16x16[c], 
+                    mayout_channel[c].pixel, block_x, block_y, 
+                    image_width, 
                     image_height);
             }
         }
@@ -3243,7 +3243,7 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
                 {
                     continue;
                 }
-
+                
                 float rgb0_c[3][kDCTBlockSize];
                 int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, ix, iy);
 
diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp
index cafb0bf7..a18cd110 100644
--- a/clguetzli/clguetzli.cl.cpp
+++ b/clguetzli/clguetzli.cl.cpp
@@ -92,7 +92,7 @@ namespace guetzli
         imgOpsinDynamicsBlockList.clear();
         imgMaskXyzScaleBlockList.clear();
     }
-
+    
     double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const
     {
         double err = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask);
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 00449722..7b2cd995 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -79,21 +79,11 @@ void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, cons
 
     clOpsinDynamicsImageEx(rgb, xsize, ysize);
 
-    cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-    cl_float *result_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-    cl_float *result_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, rgb.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-
+    clEnqueueReadBuffer(ocl.commandQueue, rgb.r, false, 0, channel_size, r, 0, NULL, NULL);
+    clEnqueueReadBuffer(ocl.commandQueue, rgb.g, false, 0, channel_size, g, 0, NULL, NULL);
+    clEnqueueReadBuffer(ocl.commandQueue, rgb.b, false, 0, channel_size, b, 0, NULL, NULL);
     err = clFinish(ocl.commandQueue);
 
-    memcpy(r, result_r, channel_size);
-    memcpy(g, result_g, channel_size);
-    memcpy(b, result_b, channel_size);
-
-    clEnqueueUnmapMemObject(ocl.commandQueue, rgb.r, result_r, 0, NULL, NULL);
-    clEnqueueUnmapMemObject(ocl.commandQueue, rgb.g, result_g, 0, NULL, NULL);
-    clEnqueueUnmapMemObject(ocl.commandQueue, rgb.b, result_b, 0, NULL, NULL);
-    clFinish(ocl.commandQueue);
-
     ocl.releaseMemChannels(rgb);
 }
 
@@ -138,12 +128,8 @@ void clDiffmapOpsinDynamicsImage(
 
     clCalculateDiffmapEx(mem_result, xsize, ysize, step);
 
-    cl_float *result_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mem_result, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+    clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, result, 0, NULL, NULL);
     err = clFinish(ocl.commandQueue);
-    memcpy(result, result_r, channel_size);
-
-    clEnqueueUnmapMemObject(ocl.commandQueue, mem_result, result_r, 0, NULL, NULL);
-    clFinish(ocl.commandQueue);
 
     ocl.releaseMemChannels(xyb1);
     ocl.releaseMemChannels(xyb0);
@@ -238,11 +224,7 @@ void clComputeBlockZeroingOrder(
         LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err));
     }
 
-    CoeffData *result = (CoeffData *)clEnqueueMapBuffer(ocl.commandQueue, mem_output_order_batch, true, CL_MAP_READ, 0, output_order_batch_size, 0, NULL, NULL, &err);
-    err = clFinish(ocl.commandQueue);
-    memcpy(output_order_batch, result, output_order_batch_size);
-
-    clEnqueueUnmapMemObject(ocl.commandQueue, mem_output_order_batch, result, 0, NULL, NULL);
+    clEnqueueReadBuffer(ocl.commandQueue, mem_output_order_batch, false, 0, output_order_batch_size, output_order_batch, 0, NULL, NULL);
     clFinish(ocl.commandQueue);
 
     for (int c = 0; c < 3; c++)
@@ -277,21 +259,14 @@ void clMask(
 
     clMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize);
 
-    cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-    cl_float *r0_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-    cl_float *r0_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-    cl_float *r1_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-    cl_float *r1_g = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.g, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
-    cl_float *r1_b = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask_dc.b, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
+    clEnqueueReadBuffer(ocl.commandQueue, mask.r, false, 0, channel_size, mask_r, 0, NULL, NULL);
+    clEnqueueReadBuffer(ocl.commandQueue, mask.g, false, 0, channel_size, mask_g, 0, NULL, NULL);
+    clEnqueueReadBuffer(ocl.commandQueue, mask.b, false, 0, channel_size, mask_b, 0, NULL, NULL);
+    clEnqueueReadBuffer(ocl.commandQueue, mask_dc.r, false, 0, channel_size, maskdc_r, 0, NULL, NULL);
+    clEnqueueReadBuffer(ocl.commandQueue, mask_dc.g, false, 0, channel_size, maskdc_g, 0, NULL, NULL);
+    clEnqueueReadBuffer(ocl.commandQueue, mask_dc.b, false, 0, channel_size, maskdc_b, 0, NULL, NULL);
     err = clFinish(ocl.commandQueue);
 
-    memcpy(mask_r, r0_r, channel_size);
-    memcpy(mask_g, r0_g, channel_size);
-    memcpy(mask_b, r0_b, channel_size);
-    memcpy(maskdc_r, r1_r, channel_size);
-    memcpy(maskdc_g, r1_g, channel_size);
-    memcpy(maskdc_b, r1_b, channel_size);
-
     ocl.releaseMemChannels(rgb);
     ocl.releaseMemChannels(rgb2);
     ocl.releaseMemChannels(mask);
@@ -614,7 +589,7 @@ void clMaskHighIntensityChangeEx(
 
 void clEdgeDetectorMapEx(
     cl_mem result/*out*/,
-    const ocl_channels &rgb, const ocl_channels &rgb2,
+    const ocl_channels &rgb, const ocl_channels &rgb2, 
     const size_t xsize, const size_t ysize, const size_t step)
 {
 	cl_int channel_size = xsize * ysize * sizeof(float);
@@ -625,7 +600,7 @@ void clEdgeDetectorMapEx(
 	ocl_channels rgb_blured = ocl.allocMemChannels(channel_size);
 	ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size);
 
-	static const double kSigma[3] = { 1.5, 0.586, 0.4 };
+ 	static const double kSigma[3] = { 1.5, 0.586, 0.4 };
 
 	for (int i = 0; i < 3; i++)
 	{
@@ -669,7 +644,7 @@ void clEdgeDetectorMapEx(
 }
 
 void clBlockDiffMapEx(
-    cl_mem block_diff_dc/*out*/,
+    cl_mem block_diff_dc/*out*/, 
     cl_mem block_diff_ac/*out*/,
     const ocl_channels &rgb, const ocl_channels &rgb2,
 	const size_t xsize, const size_t ysize, const size_t step)
@@ -767,7 +742,7 @@ void clEdgeDetectorLowFreqEx(
 
 void clDiffPrecomputeEx(
     ocl_channels &mask/*out*/,
-    const ocl_channels &xyb0, const ocl_channels &xyb1,
+    const ocl_channels &xyb0, const ocl_channels &xyb1, 
     const size_t xsize, const size_t ysize)
 {
 	cl_int err = CL_SUCCESS;
@@ -855,8 +830,8 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize
 }
 
 void clMinSquareValEx(
-    cl_mem img/*in,out*/,
-    const size_t xsize, const size_t ysize,
+    cl_mem img/*in,out*/, 
+    const size_t xsize, const size_t ysize, 
     const size_t square_size, const size_t offset)
 {
 	cl_int err = CL_SUCCESS;
@@ -1056,12 +1031,12 @@ void clMaskEx(
 
 void clCombineChannelsEx(
     cl_mem result/*out*/,
-	const ocl_channels &mask,
-	const ocl_channels &mask_dc,
+	const ocl_channels &mask, 
+	const ocl_channels &mask_dc, 
     const size_t xsize, const size_t ysize,
-	const cl_mem block_diff_dc,
-	const cl_mem block_diff_ac,
-	const cl_mem edge_detector_map,
+	const cl_mem block_diff_dc, 
+	const cl_mem block_diff_ac, 
+	const cl_mem edge_detector_map, 
 	const size_t res_xsize,
 	const size_t step)
 {
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 61743e3d..8407a1c5 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -11,7 +11,7 @@ extern bool g_useCuda;
 extern bool g_checkOpenCL;
 
 void clOpsinDynamicsImage(
-    float *r, float *g, float *b,
+    float *r, float *g, float *b, 
     const size_t xsize, const size_t ysize);
 
 void clDiffmapOpsinDynamicsImage(
@@ -36,7 +36,7 @@ void clComputeBlockZeroingOrder(
 
 void clMask(
     float* mask_r,   float* mask_g,   float* mask_b,
-    float* maskdc_r, float* maskdc_g, float* maskdc_b,
+    float* maskdc_r, float* maskdc_g, float* maskdc_b, 
     const size_t xsize, const size_t ysize,
     const float* r,  const float* g,  const float* b,
     const float* r2, const float* g2, const float* b2);
@@ -48,7 +48,7 @@ void clConvolutionEx(
     int xstep, int offset, double border_ratio);
 
 void clConvolutionXEx(
-    cl_mem result/*out*/,
+    cl_mem result/*out*/, 
     const cl_mem inp, size_t xsize, size_t ysize,
     const cl_mem multipliers, size_t len,
     int xstep, int offset, double border_ratio);
@@ -85,7 +85,7 @@ void clEdgeDetectorMapEx(
     const size_t xsize, const size_t ysize, const size_t step);
 
 void clBlockDiffMapEx(
-    cl_mem block_diff_dc/*out*/,
+    cl_mem block_diff_dc/*out*/, 
     cl_mem block_diff_ac/*out*/,
     const ocl_channels &rgb, const ocl_channels &rgb2,
 	const size_t xsize, const size_t ysize, const size_t step);
@@ -97,7 +97,7 @@ void clEdgeDetectorLowFreqEx(
 
 void clDiffPrecomputeEx(
     ocl_channels &mask/*out*/,
-    const ocl_channels &xyb0, const ocl_channels &xyb1,
+    const ocl_channels &xyb0, const ocl_channels &xyb1, 
     const size_t xsize, const size_t ysize);
 
 void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w);
@@ -105,8 +105,8 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w);
 void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize);
 
 void clMinSquareValEx(
-    cl_mem img/*in,out*/,
-    const size_t xsize, const size_t ysize,
+    cl_mem img/*in,out*/, 
+    const size_t xsize, const size_t ysize, 
     const size_t square_size, const size_t offset);
 
 void clMaskEx(
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 15c1317b..2cadfb85 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -90,7 +90,7 @@ void tclEdgeDetectorMap(const float* r, const float* g, const float* b,
 	err = clFinish(ocl.commandQueue);
 
 	FLOAT_COMPARE(result, r_r, res_xsize * res_ysize * 3);
-
+	
 	clEnqueueUnmapMemObject(ocl.commandQueue, edge, r_r, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
@@ -114,7 +114,7 @@ void tclBlockDiffMap(const float* r, const float* g, const float* b,
 	ocl_args_d_t &ocl = getOcl();
 	ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
 	ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
-
+	
 	cl_mem block_diff_dc = ocl.allocMem(reschannel_size);
 	cl_mem block_diff_ac = ocl.allocMem(reschannel_size);
 
@@ -187,7 +187,7 @@ void tclMask(const float* r, const float* g, const float* b,
 
 	ocl_channels mask = ocl.allocMemChannels(channel_size);
 	ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
-
+    	
 	clMaskEx(mask/*out*/, mask_dc/*out*/, rgb, rgb2, xsize, ysize);
 
 	cl_float *r0_r = (cl_float *)clEnqueueMapBuffer(ocl.commandQueue, mask.r, true, CL_MAP_READ, 0, channel_size, 0, NULL, NULL, &err);
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index aecd900e..d92fb1a4 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -184,7 +184,7 @@ void* ocl_args_d_t::allocC(size_t s)
 	cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64;
 	outputC = _aligned_malloc(optimizedSize, 4096);
 	lenC = s;
-
+	
 	cl_int err = 0;
 	dstMem = clCreateBuffer(this->context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, s, outputC, &err);
 	if (CL_SUCCESS != err)
@@ -204,7 +204,7 @@ cl_mem ocl_args_d_t::allocMem(size_t s, const void *init)
 		LogError("Error: allocMem() for buffer returned %s.\n", TranslateOpenCLError(err));
 	}
     if (!mem) return NULL;
-
+    
     // init memory
     if (init)
     {
@@ -323,7 +323,7 @@ const char* TranslateOpenCLError(cl_int errorCode)
 	case CL_INVALID_LINKER_OPTIONS:             return "CL_INVALID_LINKER_OPTIONS";                             //-67
 	case CL_INVALID_DEVICE_PARTITION_COUNT:     return "CL_INVALID_DEVICE_PARTITION_COUNT";                     //-68
 																												//    case CL_INVALID_PIPE_SIZE:                  return "CL_INVALID_PIPE_SIZE";                                  //-69
-																												//    case CL_INVALID_DEVICE_QUEUE:               return "CL_INVALID_DEVICE_QUEUE";                               //-70
+																												//    case CL_INVALID_DEVICE_QUEUE:               return "CL_INVALID_DEVICE_QUEUE";                               //-70    
 
 	default:
 		return "UNKNOWN ERROR CODE";
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 37679770..fd7e78e7 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -104,3 +104,4 @@ struct ocl_args_d_t
 	void*			 outputC;
 	size_t			 lenC;
 };
+
diff --git a/clguetzli/utils.h b/clguetzli/utils.h
index fc68fec5..71d8d7a1 100644
--- a/clguetzli/utils.h
+++ b/clguetzli/utils.h
@@ -29,3 +29,4 @@ void LogError(const char* str, ...);
 
 // Read OpenCL source code from fileName and store it in source. The number of read bytes returns in sourceSize
 int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize);
+
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index f2711884..b8798eb2 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -51,7 +51,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\IntelOpenCL.props" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\IntelOpenCL.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
@@ -79,8 +79,8 @@
     <IntDir>obj\x86\Release\guetzli\</IntDir>
     <TargetName>guetzli</TargetName>
     <TargetExt>.exe</TargetExt>
-    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty)</IncludePath>
-    <LibraryPath>$(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86)</LibraryPath>
+    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty)</IncludePath>
+    <LibraryPath>$(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86)</LibraryPath>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <LinkIncremental>true</LinkIncremental>
@@ -95,121 +95,121 @@
     <IntDir>obj\x86\Debug\guetzli\</IntDir>
     <TargetName>guetzli</TargetName>
     <TargetExt>.exe</TargetExt>
-    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty)</IncludePath>
-    <LibraryPath>$(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86)</LibraryPath>
+    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);$(VC_IncludeThirdParty)</IncludePath>
+    <LibraryPath>$(VC_LibraryPath_x86);$(WindowsSDK_LibraryPath_x86);$(NETFXKitsDir)Lib\um\x86;$(VC_LibThirdParty_x86)</LibraryPath>
   </PropertyGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <PrecompiledHeader>NotUsing</PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <Optimization>Full</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <Optimization>Full</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
       <MinimalRebuild>false</MinimalRebuild>
       <StringPooling>true</StringPooling>
-      <PreprocessorDefinitions>ENABLE_OPENCL;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>ENABLE_OPENCL;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
-      <SubSystem>Console</SubSystem>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
-      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64</AdditionalLibraryDirectories>
-    </Link>
-    <CustomBuild>
-      <Command>"$(INTELOCLSDKROOT)bin\x64\ioc64.exe" -cmd=build -input="%(FullPath)" -output="x64\Release\%(Filename).out" -VS -device=GPU -simd=default -targetos=current            -bo="           "</Command>
-    </CustomBuild>
-    <CustomBuild>
-      <Message>OpenCL Code Builder</Message>
-    </CustomBuild>
-    <CustomBuild>
-      <LinkObjects>false</LinkObjects>
-    </CustomBuild>
-    <Intel_OpenCL_Build_Rules />
-    <PostBuildEvent />
-    <PreBuildEvent />
-    <PreBuildEvent />
-    <PreBuildEvent />
-    <PostBuildEvent />
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <ClCompile>
-      <PrecompiledHeader>NotUsing</PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>false</IntrinsicFunctions>
-      <MinimalRebuild>false</MinimalRebuild>
-      <StringPooling>true</StringPooling>
-      <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
-      <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
-      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32</AdditionalLibraryDirectories>
-    </Link>
-    <PostBuildEvent>
-      <Command>compile.bat</Command>
-    </PostBuildEvent>
-    <CustomBuild>
-      <Message>CUDA CU</Message>
-    </CustomBuild>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <PrecompiledHeader>NotUsing</PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
-      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>ENABLE_OPENCL;ENABLE_OPENCL_CHECK;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
-      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64</AdditionalLibraryDirectories>
-    </Link>
-    <PostBuildEvent />
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <ClCompile>
-      <PrecompiledHeader>NotUsing</PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
-      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
-      <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
-      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32</AdditionalLibraryDirectories>
-    </Link>
-    <PostBuildEvent />
-  </ItemDefinitionGroup>
-  <ItemGroup>
-    <ClInclude Include="clguetzli\clbutter_comparator.h" />
-    <ClInclude Include="clguetzli\clguetzli.cl.h" />
-    <ClInclude Include="clguetzli\clguetzli.h" />
-    <ClInclude Include="clguetzli\clguetzli_test.h" />
-    <ClInclude Include="clguetzli\ocl.h" />
-    <ClInclude Include="clguetzli\ocu.h" />
-    <ClInclude Include="clguetzli\utils.h" />
-    <ClInclude Include="guetzli\butteraugli_comparator.h" />
-    <ClInclude Include="guetzli\color_transform.h" />
-    <ClInclude Include="guetzli\comparator.h" />
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <AdditionalDependencies>nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
+      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64</AdditionalLibraryDirectories>
+    </Link>
+    <CustomBuild>
+      <Command>"$(INTELOCLSDKROOT)bin\x64\ioc64.exe" -cmd=build -input="%(FullPath)" -output="x64\Release\%(Filename).out" -VS -device=GPU -simd=default -targetos=current            -bo="           "</Command>
+    </CustomBuild>
+    <CustomBuild>
+      <Message>OpenCL Code Builder</Message>
+    </CustomBuild>
+    <CustomBuild>
+      <LinkObjects>false</LinkObjects>
+    </CustomBuild>
+    <Intel_OpenCL_Build_Rules />
+    <PostBuildEvent />
+    <PreBuildEvent />
+    <PreBuildEvent />
+    <PreBuildEvent />
+    <PostBuildEvent />
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>false</IntrinsicFunctions>
+      <MinimalRebuild>false</MinimalRebuild>
+      <StringPooling>true</StringPooling>
+      <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
+      <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
+      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32</AdditionalLibraryDirectories>
+    </Link>
+    <PostBuildEvent>
+      <Command>compile.bat</Command>
+    </PostBuildEvent>
+    <CustomBuild>
+      <Message>CUDA CU</Message>
+    </CustomBuild>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>ENABLE_OPENCL;ENABLE_OPENCL_CHECK;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
+      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64</AdditionalLibraryDirectories>
+    </Link>
+    <PostBuildEvent />
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
+      <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
+      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32</AdditionalLibraryDirectories>
+    </Link>
+    <PostBuildEvent />
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClInclude Include="clguetzli\clbutter_comparator.h" />
+    <ClInclude Include="clguetzli\clguetzli.cl.h" />
+    <ClInclude Include="clguetzli\clguetzli.h" />
+    <ClInclude Include="clguetzli\clguetzli_test.h" />
+    <ClInclude Include="clguetzli\ocl.h" />
+    <ClInclude Include="clguetzli\ocu.h" />
+    <ClInclude Include="clguetzli\utils.h" />
+    <ClInclude Include="guetzli\butteraugli_comparator.h" />
+    <ClInclude Include="guetzli\color_transform.h" />
+    <ClInclude Include="guetzli\comparator.h" />
     <ClInclude Include="guetzli\dct_double.h" />
     <ClInclude Include="guetzli\debug_print.h" />
     <ClInclude Include="guetzli\entropy_encode.h" />
@@ -230,84 +230,84 @@
     <ClInclude Include="guetzli\processor.h" />
     <ClInclude Include="guetzli\quality.h" />
     <ClInclude Include="guetzli\quantize.h" />
-    <ClInclude Include="guetzli\score.h" />
-    <ClInclude Include="guetzli\stats.h" />
-    <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\addressmap-inl.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-linuxppc.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-macosx.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\basictypes.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\commandlineflags.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\googleinit.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\logging.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_linux-inl.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_posix-inl.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_win32-inl.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\stl_allocator.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\thread_annotations.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\central_freelist.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\common.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\config_for_unittests.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-checker.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-profiler.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_extension.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_hook.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\profiler.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\stacktrace.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\internal_logging.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\malloc_hook-inl.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\packed-cache-inl.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\pagemap.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap_allocator.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\raw_printer.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\sampler.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\span.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stacktrace_win32-inl.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\static_vars.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\symbolize.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\system-alloc.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\tcmalloc.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\thread_cache.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\config.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler_types.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\port.h" />
-    <ClInclude Include="third_party\libpng\png.h" />
-    <ClInclude Include="third_party\libpng\pngconf.h" />
-    <ClInclude Include="third_party\libpng\pngpriv.h" />
-    <ClInclude Include="third_party\zlib\crc32.h" />
-    <ClInclude Include="third_party\zlib\deflate.h" />
-    <ClInclude Include="third_party\zlib\gzguts.h" />
-    <ClInclude Include="third_party\zlib\inffast.h" />
-    <ClInclude Include="third_party\zlib\inffixed.h" />
-    <ClInclude Include="third_party\zlib\inflate.h" />
-    <ClInclude Include="third_party\zlib\inftrees.h" />
-    <ClInclude Include="third_party\zlib\trees.h" />
-    <ClInclude Include="third_party\zlib\zconf.h" />
-    <ClInclude Include="third_party\zlib\zlib.h" />
-    <ClInclude Include="third_party\zlib\zutil.h" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="clguetzli\clbutter_comparator.cpp" />
-    <ClCompile Include="clguetzli\clguetzli.cl.cpp" />
-    <ClCompile Include="clguetzli\clguetzli.cpp" />
-    <ClCompile Include="clguetzli\clguetzli_test.cpp" />
-    <ClCompile Include="clguetzli\ocl.cpp" />
-    <ClCompile Include="clguetzli\ocu.cpp" />
-    <ClCompile Include="clguetzli\utils.cpp" />
-    <ClCompile Include="guetzli\butteraugli_comparator.cc" />
-    <ClCompile Include="guetzli\dct_double.cc" />
-    <ClCompile Include="guetzli\debug_print.cc" />
+    <ClInclude Include="guetzli\score.h" />
+    <ClInclude Include="guetzli\stats.h" />
+    <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\addressmap-inl.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-linuxppc.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-macosx.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\basictypes.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\commandlineflags.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\googleinit.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\logging.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_linux-inl.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_posix-inl.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_win32-inl.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\stl_allocator.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\thread_annotations.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\central_freelist.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\common.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\config_for_unittests.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-checker.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-profiler.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_extension.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_hook.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\profiler.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\stacktrace.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\internal_logging.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\malloc_hook-inl.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\packed-cache-inl.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\pagemap.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap_allocator.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\raw_printer.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\sampler.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\span.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stacktrace_win32-inl.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\static_vars.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\symbolize.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\system-alloc.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\tcmalloc.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\thread_cache.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\config.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler_types.h" />
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\port.h" />
+    <ClInclude Include="third_party\libpng\png.h" />
+    <ClInclude Include="third_party\libpng\pngconf.h" />
+    <ClInclude Include="third_party\libpng\pngpriv.h" />
+    <ClInclude Include="third_party\zlib\crc32.h" />
+    <ClInclude Include="third_party\zlib\deflate.h" />
+    <ClInclude Include="third_party\zlib\gzguts.h" />
+    <ClInclude Include="third_party\zlib\inffast.h" />
+    <ClInclude Include="third_party\zlib\inffixed.h" />
+    <ClInclude Include="third_party\zlib\inflate.h" />
+    <ClInclude Include="third_party\zlib\inftrees.h" />
+    <ClInclude Include="third_party\zlib\trees.h" />
+    <ClInclude Include="third_party\zlib\zconf.h" />
+    <ClInclude Include="third_party\zlib\zlib.h" />
+    <ClInclude Include="third_party\zlib\zutil.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="clguetzli\clbutter_comparator.cpp" />
+    <ClCompile Include="clguetzli\clguetzli.cl.cpp" />
+    <ClCompile Include="clguetzli\clguetzli.cpp" />
+    <ClCompile Include="clguetzli\clguetzli_test.cpp" />
+    <ClCompile Include="clguetzli\ocl.cpp" />
+    <ClCompile Include="clguetzli\ocu.cpp" />
+    <ClCompile Include="clguetzli\utils.cpp" />
+    <ClCompile Include="guetzli\butteraugli_comparator.cc" />
+    <ClCompile Include="guetzli\dct_double.cc" />
+    <ClCompile Include="guetzli\debug_print.cc" />
     <ClCompile Include="guetzli\entropy_encode.cc" />
     <ClCompile Include="guetzli\fdct.cc" />
     <ClCompile Include="guetzli\gamma_correct.cc" />
@@ -323,102 +323,102 @@
     <ClCompile Include="guetzli\preprocess_downsample.cc" />
     <ClCompile Include="guetzli\processor.cc" />
     <ClCompile Include="guetzli\quality.cc" />
-    <ClCompile Include="guetzli\quantize.cc" />
-    <ClCompile Include="guetzli\score.cc" />
-    <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\dynamic_annotations.c" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\logging.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\central_freelist.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\common.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\fake_stacktrace_scope.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\internal_logging.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_extension.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_hook.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\page_heap.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\raw_printer.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\sampler.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\span.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stacktrace.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\static_vars.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\symbolize.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\thread_cache.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_modrm_map.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_opcode_map.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\patch_functions.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\port.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher_with_stub.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\system-alloc.cc" />
-    <ClCompile Include="third_party\libpng\png.c" />
-    <ClCompile Include="third_party\libpng\pngerror.c" />
-    <ClCompile Include="third_party\libpng\pngget.c" />
-    <ClCompile Include="third_party\libpng\pngmem.c" />
-    <ClCompile Include="third_party\libpng\pngpread.c" />
-    <ClCompile Include="third_party\libpng\pngread.c" />
-    <ClCompile Include="third_party\libpng\pngrio.c" />
-    <ClCompile Include="third_party\libpng\pngrtran.c" />
-    <ClCompile Include="third_party\libpng\pngrutil.c" />
-    <ClCompile Include="third_party\libpng\pngset.c" />
-    <ClCompile Include="third_party\libpng\pngtrans.c" />
-    <ClCompile Include="third_party\libpng\pngwio.c" />
-    <ClCompile Include="third_party\libpng\pngwrite.c" />
-    <ClCompile Include="third_party\libpng\pngwtran.c" />
-    <ClCompile Include="third_party\libpng\pngwutil.c" />
-    <ClCompile Include="third_party\zlib\adler32.c" />
-    <ClCompile Include="third_party\zlib\compress.c" />
-    <ClCompile Include="third_party\zlib\crc32.c" />
-    <ClCompile Include="third_party\zlib\deflate.c" />
-    <ClCompile Include="third_party\zlib\gzclose.c" />
-    <ClCompile Include="third_party\zlib\gzlib.c" />
-    <ClCompile Include="third_party\zlib\gzread.c" />
-    <ClCompile Include="third_party\zlib\gzwrite.c" />
-    <ClCompile Include="third_party\zlib\infback.c" />
-    <ClCompile Include="third_party\zlib\inffast.c" />
-    <ClCompile Include="third_party\zlib\inflate.c" />
-    <ClCompile Include="third_party\zlib\inftrees.c" />
-    <ClCompile Include="third_party\zlib\trees.c" />
-    <ClCompile Include="third_party\zlib\uncompr.c" />
-    <ClCompile Include="third_party\zlib\zutil.c" />
-  </ItemGroup>
-  <ItemGroup>
-    <Intel_OpenCL_Build_Rules Include="clguetzli\clguetzli.cl">
-      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-      </Command>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-    </Intel_OpenCL_Build_Rules>
-    <CustomBuild Include="clguetzli\clguetzli.cu">
-      <FileType>Document</FileType>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CUDA Code Builder</Message>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)compile.bat</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compile.bat</Command>
-      <LinkObjects Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkObjects>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cu.ptx</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ProjectDir)compile.bat</Command>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CUDA Code Builder</Message>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cu.ptx</Outputs>
-      <LinkObjects Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</LinkObjects>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <None Include="third_party\libpng\pngwin.def" />
-    <None Include="third_party\zlib\inffas32.asm" />
-    <None Include="third_party\zlib\match32.asm" />
-    <None Include="third_party\zlib\match686.asm" />
-    <None Include="third_party\zlib\zlib.def" />
+    <ClCompile Include="guetzli\quantize.cc" />
+    <ClCompile Include="guetzli\score.cc" />
+    <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\dynamic_annotations.c" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\logging.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\central_freelist.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\common.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\fake_stacktrace_scope.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\internal_logging.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_extension.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_hook.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\page_heap.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\raw_printer.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\sampler.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\span.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stacktrace.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\static_vars.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\symbolize.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\thread_cache.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_modrm_map.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_opcode_map.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\patch_functions.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\port.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher_with_stub.cc" />
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\system-alloc.cc" />
+    <ClCompile Include="third_party\libpng\png.c" />
+    <ClCompile Include="third_party\libpng\pngerror.c" />
+    <ClCompile Include="third_party\libpng\pngget.c" />
+    <ClCompile Include="third_party\libpng\pngmem.c" />
+    <ClCompile Include="third_party\libpng\pngpread.c" />
+    <ClCompile Include="third_party\libpng\pngread.c" />
+    <ClCompile Include="third_party\libpng\pngrio.c" />
+    <ClCompile Include="third_party\libpng\pngrtran.c" />
+    <ClCompile Include="third_party\libpng\pngrutil.c" />
+    <ClCompile Include="third_party\libpng\pngset.c" />
+    <ClCompile Include="third_party\libpng\pngtrans.c" />
+    <ClCompile Include="third_party\libpng\pngwio.c" />
+    <ClCompile Include="third_party\libpng\pngwrite.c" />
+    <ClCompile Include="third_party\libpng\pngwtran.c" />
+    <ClCompile Include="third_party\libpng\pngwutil.c" />
+    <ClCompile Include="third_party\zlib\adler32.c" />
+    <ClCompile Include="third_party\zlib\compress.c" />
+    <ClCompile Include="third_party\zlib\crc32.c" />
+    <ClCompile Include="third_party\zlib\deflate.c" />
+    <ClCompile Include="third_party\zlib\gzclose.c" />
+    <ClCompile Include="third_party\zlib\gzlib.c" />
+    <ClCompile Include="third_party\zlib\gzread.c" />
+    <ClCompile Include="third_party\zlib\gzwrite.c" />
+    <ClCompile Include="third_party\zlib\infback.c" />
+    <ClCompile Include="third_party\zlib\inffast.c" />
+    <ClCompile Include="third_party\zlib\inflate.c" />
+    <ClCompile Include="third_party\zlib\inftrees.c" />
+    <ClCompile Include="third_party\zlib\trees.c" />
+    <ClCompile Include="third_party\zlib\uncompr.c" />
+    <ClCompile Include="third_party\zlib\zutil.c" />
+  </ItemGroup>
+  <ItemGroup>
+    <Intel_OpenCL_Build_Rules Include="clguetzli\clguetzli.cl">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+      </Command>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+    </Intel_OpenCL_Build_Rules>
+    <CustomBuild Include="clguetzli\clguetzli.cu">
+      <FileType>Document</FileType>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CUDA Code Builder</Message>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)compile.bat</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compile.bat</Command>
+      <LinkObjects Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkObjects>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cu.ptx</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ProjectDir)compile.bat</Command>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CUDA Code Builder</Message>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cu.ptx</Outputs>
+      <LinkObjects Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</LinkObjects>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+    </CustomBuild>
+    <None Include="third_party\libpng\pngwin.def" />
+    <None Include="third_party\zlib\inffas32.asm" />
+    <None Include="third_party\zlib\match32.asm" />
+    <None Include="third_party\zlib\match686.asm" />
+    <None Include="third_party\zlib\zlib.def" />
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\IntelOpenCL.targets" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\IntelOpenCL.targets" />
   </ImportGroup>
 </Project>
\ No newline at end of file
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index 17b8edf4..07f56763 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -1,4 +1,4 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
+﻿<?xml version="1.0" encoding="utf-8"?>
 <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup>
     <Filter Include="guetzli">
@@ -13,21 +13,21 @@
     <Filter Include="third_party\butteraugli\butteraugli">
       <UniqueIdentifier>{FD6FCB41-6929-36EC-F288-50C65E41EC5B}</UniqueIdentifier>
     </Filter>
-    <Filter Include="third_party\libpng">
-      <UniqueIdentifier>{40be58d6-6dfc-45a3-8ca1-7d1b14051ddc}</UniqueIdentifier>
-    </Filter>
-    <Filter Include="third_party\zlib">
-      <UniqueIdentifier>{cb89c1ac-8399-4814-88f2-4b69576bc9f9}</UniqueIdentifier>
-    </Filter>
-    <Filter Include="third_party\tcmalloc_minimal">
-      <UniqueIdentifier>{f2b475de-6219-478e-9e5e-08f07ef25dbc}</UniqueIdentifier>
-    </Filter>
-    <Filter Include="clguetzli">
-      <UniqueIdentifier>{64847a89-ca39-4556-ba0e-d6875c4d39ca}</UniqueIdentifier>
-    </Filter>
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="guetzli\butteraugli_comparator.h">
+    <Filter Include="third_party\libpng">
+      <UniqueIdentifier>{40be58d6-6dfc-45a3-8ca1-7d1b14051ddc}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="third_party\zlib">
+      <UniqueIdentifier>{cb89c1ac-8399-4814-88f2-4b69576bc9f9}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="third_party\tcmalloc_minimal">
+      <UniqueIdentifier>{f2b475de-6219-478e-9e5e-08f07ef25dbc}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="clguetzli">
+      <UniqueIdentifier>{64847a89-ca39-4556-ba0e-d6875c4d39ca}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="guetzli\butteraugli_comparator.h">
       <Filter>guetzli</Filter>
     </ClInclude>
     <ClInclude Include="guetzli\color_transform.h">
@@ -105,219 +105,219 @@
     <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h">
       <Filter>third_party\butteraugli\butteraugli</Filter>
     </ClInclude>
-    <ClInclude Include="third_party\libpng\png.h">
-      <Filter>third_party\libpng</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\libpng\pngconf.h">
-      <Filter>third_party\libpng</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\libpng\pngpriv.h">
-      <Filter>third_party\libpng</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\crc32.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\deflate.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\gzguts.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inffast.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inffixed.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inflate.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inftrees.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\trees.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\zconf.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\zlib.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\zutil.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\addressmap-inl.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\central_freelist.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\common.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\config_for_unittests.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\internal_logging.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\malloc_hook-inl.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\packed-cache-inl.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap_allocator.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\pagemap.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\raw_printer.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\sampler.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\span.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stacktrace_win32-inl.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\static_vars.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\symbolize.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\system-alloc.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\tcmalloc.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\thread_cache.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-linuxppc.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-macosx.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\basictypes.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\commandlineflags.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\googleinit.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\logging.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_linux-inl.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_posix-inl.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_win32-inl.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\stl_allocator.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\thread_annotations.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\config.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler_types.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\port.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-checker.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_extension.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\profiler.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\stacktrace.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-profiler.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_hook.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="clguetzli\utils.h">
-      <Filter>clguetzli</Filter>
-    </ClInclude>
-    <ClInclude Include="clguetzli\ocl.h">
-      <Filter>clguetzli</Filter>
-    </ClInclude>
-    <ClInclude Include="clguetzli\clguetzli.h">
-      <Filter>clguetzli</Filter>
-    </ClInclude>
-    <ClInclude Include="clguetzli\clguetzli_test.h">
-      <Filter>clguetzli</Filter>
-    </ClInclude>
-    <ClInclude Include="clguetzli\clbutter_comparator.h">
-      <Filter>clguetzli</Filter>
-    </ClInclude>
-    <ClInclude Include="clguetzli\clguetzli.cl.h">
-      <Filter>clguetzli</Filter>
-    </ClInclude>
-    <ClInclude Include="clguetzli\ocu.h">
-      <Filter>clguetzli</Filter>
-    </ClInclude>
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="guetzli\butteraugli_comparator.cc">
+    <ClInclude Include="third_party\libpng\png.h">
+      <Filter>third_party\libpng</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\libpng\pngconf.h">
+      <Filter>third_party\libpng</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\libpng\pngpriv.h">
+      <Filter>third_party\libpng</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\crc32.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\deflate.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\gzguts.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inffast.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inffixed.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inflate.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inftrees.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\trees.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\zconf.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\zlib.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\zutil.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\addressmap-inl.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\central_freelist.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\common.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\config_for_unittests.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\internal_logging.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\malloc_hook-inl.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\packed-cache-inl.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap_allocator.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\pagemap.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\raw_printer.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\sampler.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\span.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stacktrace_win32-inl.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\static_vars.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\symbolize.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\system-alloc.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\tcmalloc.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\thread_cache.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-linuxppc.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-macosx.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\basictypes.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\commandlineflags.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\googleinit.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\logging.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_linux-inl.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_posix-inl.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_win32-inl.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\stl_allocator.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\thread_annotations.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\config.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler_types.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\port.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-checker.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_extension.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\profiler.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\stacktrace.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-profiler.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_hook.h">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClInclude>
+    <ClInclude Include="clguetzli\utils.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
+    <ClInclude Include="clguetzli\ocl.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
+    <ClInclude Include="clguetzli\clguetzli.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
+    <ClInclude Include="clguetzli\clguetzli_test.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
+    <ClInclude Include="clguetzli\clbutter_comparator.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
+    <ClInclude Include="clguetzli\clguetzli.cl.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
+    <ClInclude Include="clguetzli\ocu.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="guetzli\butteraugli_comparator.cc">
       <Filter>guetzli</Filter>
     </ClCompile>
     <ClCompile Include="guetzli\dct_double.cc">
@@ -380,239 +380,239 @@
     <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc">
       <Filter>third_party\butteraugli\butteraugli</Filter>
     </ClCompile>
-    <ClCompile Include="third_party\libpng\png.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngerror.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngget.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngmem.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngpread.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngread.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngrio.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngrtran.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngrutil.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngset.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngtrans.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwio.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwrite.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwtran.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwutil.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\adler32.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\compress.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\crc32.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\deflate.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzclose.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzlib.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzread.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzwrite.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\infback.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\inffast.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\inflate.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\inftrees.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\trees.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\uncompr.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\zutil.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\central_freelist.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\common.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\fake_stacktrace_scope.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\internal_logging.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_extension.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_hook.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\page_heap.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\raw_printer.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\sampler.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\span.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stacktrace.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\static_vars.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\symbolize.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\thread_cache.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_modrm_map.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_opcode_map.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\patch_functions.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\port.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher_with_stub.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\system-alloc.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\dynamic_annotations.c">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\logging.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="clguetzli\utils.cpp">
-      <Filter>clguetzli</Filter>
-    </ClCompile>
-    <ClCompile Include="clguetzli\ocl.cpp">
-      <Filter>clguetzli</Filter>
-    </ClCompile>
-    <ClCompile Include="clguetzli\clguetzli.cpp">
-      <Filter>clguetzli</Filter>
-    </ClCompile>
-    <ClCompile Include="clguetzli\clguetzli_test.cpp">
-      <Filter>clguetzli</Filter>
-    </ClCompile>
-    <ClCompile Include="clguetzli\clguetzli.cl.cpp">
-      <Filter>clguetzli</Filter>
-    </ClCompile>
-    <ClCompile Include="clguetzli\clbutter_comparator.cpp">
-      <Filter>clguetzli</Filter>
-    </ClCompile>
-    <ClCompile Include="clguetzli\ocu.cpp">
-      <Filter>clguetzli</Filter>
-    </ClCompile>
-  </ItemGroup>
-  <ItemGroup>
-    <None Include="third_party\libpng\pngwin.def">
-      <Filter>third_party\libpng</Filter>
-    </None>
-    <None Include="third_party\zlib\inffas32.asm">
-      <Filter>third_party\zlib</Filter>
-    </None>
-    <None Include="third_party\zlib\match32.asm">
-      <Filter>third_party\zlib</Filter>
-    </None>
-    <None Include="third_party\zlib\match686.asm">
-      <Filter>third_party\zlib</Filter>
-    </None>
-    <None Include="third_party\zlib\zlib.def">
-      <Filter>third_party\zlib</Filter>
-    </None>
-  </ItemGroup>
-  <ItemGroup>
-    <Intel_OpenCL_Build_Rules Include="clguetzli\clguetzli.cl">
-      <Filter>clguetzli</Filter>
-    </Intel_OpenCL_Build_Rules>
-  </ItemGroup>
-  <ItemGroup>
-    <CustomBuild Include="clguetzli\clguetzli.cu">
-      <Filter>clguetzli</Filter>
-    </CustomBuild>
-  </ItemGroup>
+    <ClCompile Include="third_party\libpng\png.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngerror.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngget.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngmem.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngpread.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngread.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngrio.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngrtran.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngrutil.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngset.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngtrans.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwio.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwrite.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwtran.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwutil.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\adler32.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\compress.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\crc32.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\deflate.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzclose.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzlib.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzread.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzwrite.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\infback.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\inffast.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\inflate.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\inftrees.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\trees.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\uncompr.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\zutil.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\central_freelist.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\common.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\fake_stacktrace_scope.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\internal_logging.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_extension.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_hook.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\page_heap.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\raw_printer.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\sampler.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\span.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stacktrace.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\static_vars.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\symbolize.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\thread_cache.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_modrm_map.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_opcode_map.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\patch_functions.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\port.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher_with_stub.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\system-alloc.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\dynamic_annotations.c">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\logging.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.cc">
+      <Filter>third_party\tcmalloc_minimal</Filter>
+    </ClCompile>
+    <ClCompile Include="clguetzli\utils.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
+    <ClCompile Include="clguetzli\ocl.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
+    <ClCompile Include="clguetzli\clguetzli.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
+    <ClCompile Include="clguetzli\clguetzli_test.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
+    <ClCompile Include="clguetzli\clguetzli.cl.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
+    <ClCompile Include="clguetzli\clbutter_comparator.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
+    <ClCompile Include="clguetzli\ocu.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="third_party\libpng\pngwin.def">
+      <Filter>third_party\libpng</Filter>
+    </None>
+    <None Include="third_party\zlib\inffas32.asm">
+      <Filter>third_party\zlib</Filter>
+    </None>
+    <None Include="third_party\zlib\match32.asm">
+      <Filter>third_party\zlib</Filter>
+    </None>
+    <None Include="third_party\zlib\match686.asm">
+      <Filter>third_party\zlib</Filter>
+    </None>
+    <None Include="third_party\zlib\zlib.def">
+      <Filter>third_party\zlib</Filter>
+    </None>
+  </ItemGroup>
+  <ItemGroup>
+    <Intel_OpenCL_Build_Rules Include="clguetzli\clguetzli.cl">
+      <Filter>clguetzli</Filter>
+    </Intel_OpenCL_Build_Rules>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="clguetzli\clguetzli.cu">
+      <Filter>clguetzli</Filter>
+    </CustomBuild>
+  </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc
index 124aea8d..02256e95 100644
--- a/guetzli/butteraugli_comparator.cc
+++ b/guetzli/butteraugli_comparator.cc
@@ -97,7 +97,7 @@ void ButteraugliComparator::SwitchBlock(int block_x, int block_y,
 }
 
 double ButteraugliComparator::CompareBlock(const OutputImage& img,
-                                           int off_x, int off_y,
+                                           int off_x, int off_y, 
                                            const coeff_t* candidate_block,
                                            const int comp_mask) const {
   int block_x = block_x_ * factor_x_ + off_x;
diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc
index d8937978..587c06d4 100644
--- a/guetzli/guetzli.cc
+++ b/guetzli/guetzli.cc
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <algorithm>
 #include <cstdio>
 #include <cstdlib>
 #include <exception>
@@ -227,7 +226,7 @@ void Usage() {
       "  --memlimit M - Memory limit in MB. Guetzli will fail if unable to stay under\n"
       "                 the limit. Default limit is %d MB.\n"
 	  "  --opencl     - Use OpenCL\n"
-	  "  --cuda       - Use CUDA\n"
+	  "  --cuda       - Use CUDA\n"	 
       "  --nomemlimit - Do not limit memory usage.\n", kDefaultJPEGQuality, kDefaultMemlimitMB);
   exit(1);
 }
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 1666d4fa..35783e41 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -545,7 +545,7 @@ size_t EstimateDCSize(const JPEGData& jpg) {
 
 }  // namespace
 
-void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask,
+void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, const uint8_t comp_mask, 
                                        const double target_mul, bool stop_early)
 {
     const int width = img->width();
@@ -660,7 +660,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
     for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
         for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
             CoeffData * p = &output_order[block_ix * kBlockSize];
-
+   
             candidate_coeff_offsets[block_ix] = candidate_coeffs.size();
             for (int i = 0; i < kBlockSize; i++)
             {
@@ -682,9 +682,9 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
 
 }
 
-void Processor::SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img,
+void Processor::SelectFrequencyBackEnd(const JPEGData& jpg, OutputImage* img, 
                                         const uint8_t comp_mask,
-                                        const double target_mul,
+                                        const double target_mul, 
                                         bool stop_early,
                                         std::vector<int> &candidate_coeff_offsets,
                                         std::vector<uint8_t>& candidate_coeffs,
diff --git a/guetzli/processor.h b/guetzli/processor.h
index b36b184e..9f2c0c61 100644
--- a/guetzli/processor.h
+++ b/guetzli/processor.h
@@ -30,7 +30,7 @@ struct CoeffData {
     int idx;
     float block_err;
 };
-
+    
 struct Params {
   float butteraugli_target = 1.0;
   bool clear_metadata = true;
diff --git a/guetzli_static.vcxproj b/guetzli_static.vcxproj
index 44a911b2..1d4d4e3f 100644
--- a/guetzli_static.vcxproj
+++ b/guetzli_static.vcxproj
@@ -93,7 +93,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>Full</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
@@ -110,7 +110,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>Full</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
@@ -127,7 +127,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
     </ClCompile>
@@ -140,7 +140,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
     </ClCompile>
@@ -176,20 +176,20 @@
     <ClInclude Include="guetzli\score.h" />
     <ClInclude Include="guetzli\stats.h" />
     <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h" />
-    <ClInclude Include="third_party\libpng\png.h" />
-    <ClInclude Include="third_party\libpng\pngconf.h" />
-    <ClInclude Include="third_party\libpng\pngpriv.h" />
-    <ClInclude Include="third_party\zlib\crc32.h" />
-    <ClInclude Include="third_party\zlib\deflate.h" />
-    <ClInclude Include="third_party\zlib\gzguts.h" />
-    <ClInclude Include="third_party\zlib\inffast.h" />
-    <ClInclude Include="third_party\zlib\inffixed.h" />
-    <ClInclude Include="third_party\zlib\inflate.h" />
-    <ClInclude Include="third_party\zlib\inftrees.h" />
-    <ClInclude Include="third_party\zlib\trees.h" />
-    <ClInclude Include="third_party\zlib\zconf.h" />
-    <ClInclude Include="third_party\zlib\zlib.h" />
-    <ClInclude Include="third_party\zlib\zutil.h" />
+    <ClInclude Include="third_party\libpng\png.h" />
+    <ClInclude Include="third_party\libpng\pngconf.h" />
+    <ClInclude Include="third_party\libpng\pngpriv.h" />
+    <ClInclude Include="third_party\zlib\crc32.h" />
+    <ClInclude Include="third_party\zlib\deflate.h" />
+    <ClInclude Include="third_party\zlib\gzguts.h" />
+    <ClInclude Include="third_party\zlib\inffast.h" />
+    <ClInclude Include="third_party\zlib\inffixed.h" />
+    <ClInclude Include="third_party\zlib\inflate.h" />
+    <ClInclude Include="third_party\zlib\inftrees.h" />
+    <ClInclude Include="third_party\zlib\trees.h" />
+    <ClInclude Include="third_party\zlib\zconf.h" />
+    <ClInclude Include="third_party\zlib\zlib.h" />
+    <ClInclude Include="third_party\zlib\zutil.h" />
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc" />
@@ -212,43 +212,43 @@
     <ClCompile Include="guetzli\quantize.cc" />
     <ClCompile Include="guetzli\score.cc" />
     <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc" />
-    <ClCompile Include="third_party\libpng\png.c" />
-    <ClCompile Include="third_party\libpng\pngerror.c" />
-    <ClCompile Include="third_party\libpng\pngget.c" />
-    <ClCompile Include="third_party\libpng\pngmem.c" />
-    <ClCompile Include="third_party\libpng\pngpread.c" />
-    <ClCompile Include="third_party\libpng\pngread.c" />
-    <ClCompile Include="third_party\libpng\pngrio.c" />
-    <ClCompile Include="third_party\libpng\pngrtran.c" />
-    <ClCompile Include="third_party\libpng\pngrutil.c" />
-    <ClCompile Include="third_party\libpng\pngset.c" />
-    <ClCompile Include="third_party\libpng\pngtrans.c" />
-    <ClCompile Include="third_party\libpng\pngwio.c" />
-    <ClCompile Include="third_party\libpng\pngwrite.c" />
-    <ClCompile Include="third_party\libpng\pngwtran.c" />
-    <ClCompile Include="third_party\libpng\pngwutil.c" />
-    <ClCompile Include="third_party\zlib\adler32.c" />
-    <ClCompile Include="third_party\zlib\compress.c" />
-    <ClCompile Include="third_party\zlib\crc32.c" />
-    <ClCompile Include="third_party\zlib\deflate.c" />
-    <ClCompile Include="third_party\zlib\gzclose.c" />
-    <ClCompile Include="third_party\zlib\gzlib.c" />
-    <ClCompile Include="third_party\zlib\gzread.c" />
-    <ClCompile Include="third_party\zlib\gzwrite.c" />
-    <ClCompile Include="third_party\zlib\infback.c" />
-    <ClCompile Include="third_party\zlib\inffast.c" />
-    <ClCompile Include="third_party\zlib\inflate.c" />
-    <ClCompile Include="third_party\zlib\inftrees.c" />
-    <ClCompile Include="third_party\zlib\trees.c" />
-    <ClCompile Include="third_party\zlib\uncompr.c" />
-    <ClCompile Include="third_party\zlib\zutil.c" />
-  </ItemGroup>
-  <ItemGroup>
-    <None Include="third_party\libpng\pngwin.def" />
-    <None Include="third_party\zlib\inffas32.asm" />
-    <None Include="third_party\zlib\match32.asm" />
-    <None Include="third_party\zlib\match686.asm" />
-    <None Include="third_party\zlib\zlib.def" />
+    <ClCompile Include="third_party\libpng\png.c" />
+    <ClCompile Include="third_party\libpng\pngerror.c" />
+    <ClCompile Include="third_party\libpng\pngget.c" />
+    <ClCompile Include="third_party\libpng\pngmem.c" />
+    <ClCompile Include="third_party\libpng\pngpread.c" />
+    <ClCompile Include="third_party\libpng\pngread.c" />
+    <ClCompile Include="third_party\libpng\pngrio.c" />
+    <ClCompile Include="third_party\libpng\pngrtran.c" />
+    <ClCompile Include="third_party\libpng\pngrutil.c" />
+    <ClCompile Include="third_party\libpng\pngset.c" />
+    <ClCompile Include="third_party\libpng\pngtrans.c" />
+    <ClCompile Include="third_party\libpng\pngwio.c" />
+    <ClCompile Include="third_party\libpng\pngwrite.c" />
+    <ClCompile Include="third_party\libpng\pngwtran.c" />
+    <ClCompile Include="third_party\libpng\pngwutil.c" />
+    <ClCompile Include="third_party\zlib\adler32.c" />
+    <ClCompile Include="third_party\zlib\compress.c" />
+    <ClCompile Include="third_party\zlib\crc32.c" />
+    <ClCompile Include="third_party\zlib\deflate.c" />
+    <ClCompile Include="third_party\zlib\gzclose.c" />
+    <ClCompile Include="third_party\zlib\gzlib.c" />
+    <ClCompile Include="third_party\zlib\gzread.c" />
+    <ClCompile Include="third_party\zlib\gzwrite.c" />
+    <ClCompile Include="third_party\zlib\infback.c" />
+    <ClCompile Include="third_party\zlib\inffast.c" />
+    <ClCompile Include="third_party\zlib\inflate.c" />
+    <ClCompile Include="third_party\zlib\inftrees.c" />
+    <ClCompile Include="third_party\zlib\trees.c" />
+    <ClCompile Include="third_party\zlib\uncompr.c" />
+    <ClCompile Include="third_party\zlib\zutil.c" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="third_party\libpng\pngwin.def" />
+    <None Include="third_party\zlib\inffas32.asm" />
+    <None Include="third_party\zlib\match32.asm" />
+    <None Include="third_party\zlib\match686.asm" />
+    <None Include="third_party\zlib\zlib.def" />
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
diff --git a/guetzli_static.vcxproj.filters b/guetzli_static.vcxproj.filters
index 9362cd94..37876e3d 100644
--- a/guetzli_static.vcxproj.filters
+++ b/guetzli_static.vcxproj.filters
@@ -1,4 +1,4 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
+﻿<?xml version="1.0" encoding="utf-8"?>
 <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup>
     <Filter Include="guetzli">
@@ -13,12 +13,12 @@
     <Filter Include="third_party\butteraugli\butteraugli">
       <UniqueIdentifier>{FD6FCB41-6929-36EC-F288-50C65E41EC5B}</UniqueIdentifier>
     </Filter>
-    <Filter Include="third_party\libpng">
-      <UniqueIdentifier>{61f0e3eb-c213-49c5-883a-060bdaf927bb}</UniqueIdentifier>
-    </Filter>
-    <Filter Include="third_party\zlib">
-      <UniqueIdentifier>{ba7b6163-a7d1-4f14-b4b3-3d35f296563a}</UniqueIdentifier>
-    </Filter>
+    <Filter Include="third_party\libpng">
+      <UniqueIdentifier>{61f0e3eb-c213-49c5-883a-060bdaf927bb}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="third_party\zlib">
+      <UniqueIdentifier>{ba7b6163-a7d1-4f14-b4b3-3d35f296563a}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="guetzli\butteraugli_comparator.h">
@@ -99,48 +99,48 @@
     <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h">
       <Filter>third_party\butteraugli\butteraugli</Filter>
     </ClInclude>
-    <ClInclude Include="third_party\libpng\png.h">
-      <Filter>third_party\libpng</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\libpng\pngconf.h">
-      <Filter>third_party\libpng</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\libpng\pngpriv.h">
-      <Filter>third_party\libpng</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\crc32.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\deflate.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\gzguts.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inffast.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inffixed.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inflate.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inftrees.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\trees.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\zconf.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\zlib.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\zutil.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
+    <ClInclude Include="third_party\libpng\png.h">
+      <Filter>third_party\libpng</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\libpng\pngconf.h">
+      <Filter>third_party\libpng</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\libpng\pngpriv.h">
+      <Filter>third_party\libpng</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\crc32.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\deflate.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\gzguts.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inffast.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inffixed.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inflate.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\inftrees.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\trees.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\zconf.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\zlib.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\zlib\zutil.h">
+      <Filter>third_party\zlib</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc">
@@ -203,112 +203,112 @@
     <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc">
       <Filter>third_party\butteraugli\butteraugli</Filter>
     </ClCompile>
-    <ClCompile Include="third_party\libpng\png.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngerror.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngget.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngmem.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngpread.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngread.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngrio.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngrtran.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngrutil.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngset.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngtrans.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwio.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwrite.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwtran.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwutil.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\adler32.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\compress.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\crc32.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\deflate.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzclose.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzlib.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzread.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzwrite.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\infback.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\inffast.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\inflate.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\inftrees.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\trees.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\uncompr.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\zutil.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-  </ItemGroup>
-  <ItemGroup>
-    <None Include="third_party\libpng\pngwin.def">
-      <Filter>third_party\libpng</Filter>
-    </None>
-    <None Include="third_party\zlib\inffas32.asm">
-      <Filter>third_party\zlib</Filter>
-    </None>
-    <None Include="third_party\zlib\match32.asm">
-      <Filter>third_party\zlib</Filter>
-    </None>
-    <None Include="third_party\zlib\match686.asm">
-      <Filter>third_party\zlib</Filter>
-    </None>
-    <None Include="third_party\zlib\zlib.def">
-      <Filter>third_party\zlib</Filter>
-    </None>
+    <ClCompile Include="third_party\libpng\png.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngerror.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngget.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngmem.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngpread.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngread.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngrio.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngrtran.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngrutil.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngset.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngtrans.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwio.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwrite.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwtran.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\libpng\pngwutil.c">
+      <Filter>third_party\libpng</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\adler32.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\compress.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\crc32.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\deflate.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzclose.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzlib.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzread.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\gzwrite.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\infback.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\inffast.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\inflate.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\inftrees.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\trees.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\uncompr.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+    <ClCompile Include="third_party\zlib\zutil.c">
+      <Filter>third_party\zlib</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="third_party\libpng\pngwin.def">
+      <Filter>third_party\libpng</Filter>
+    </None>
+    <None Include="third_party\zlib\inffas32.asm">
+      <Filter>third_party\zlib</Filter>
+    </None>
+    <None Include="third_party\zlib\match32.asm">
+      <Filter>third_party\zlib</Filter>
+    </None>
+    <None Include="third_party\zlib\match686.asm">
+      <Filter>third_party\zlib</Filter>
+    </None>
+    <None Include="third_party\zlib\zlib.def">
+      <Filter>third_party\zlib</Filter>
+    </None>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/tests/golden_checksums.txt b/tests/golden_checksums.txt
index 531d0b21..5c09ef45 100644
--- a/tests/golden_checksums.txt
+++ b/tests/golden_checksums.txt
@@ -18,17 +18,17 @@ a9439e530c365a62c965e2d858c8de8fbd3f44e6e9c6ade1c2248ad373fb8755  bicycles.png.g
 9ccdf1be8f0d121d4b6888e47d187e8b6378b63d5592a2b9f001a4d017506bde  blue-rose.jpg.guetzli.jpg
 6c75b537c2d603aa51c9fbc8f7f6b1bff413de269b13eec5982d24cfcf1e0d08  blue-rose.png.guetzli.jpg
 9ccdf1be8f0d121d4b6888e47d187e8b6378b63d5592a2b9f001a4d017506bde  blue-rose-progressive.jpg.guetzli.jpg
-2763977af200403d57c7b9d9d33cfb76947076413c9531b16171c0a5fbc83339  brake-light-420.jpg.guetzli.jpg
-a84703948373a12cc1db1edd32616c983db08ff07f550a3be7b111bf9e4ba06f  brake-light.jpg.guetzli.jpg
+ac979358ac843082f2c4c094dbd757d91816710853f4c6dfaaa8b828cc03db3c  brake-light-420.jpg.guetzli.jpg
+1c1e1754f7a3304eb8e1aada4cf923af1226bfccadc6d69643af6fa7f457ec71  brake-light.jpg.guetzli.jpg
 eda4f537c54ca55eddc03097b6aa57db61c57f80e276306ff4c73b72123e5402  brake-light.png.guetzli.jpg
-a84703948373a12cc1db1edd32616c983db08ff07f550a3be7b111bf9e4ba06f  brake-light-progressive.jpg.guetzli.jpg
+1c1e1754f7a3304eb8e1aada4cf923af1226bfccadc6d69643af6fa7f457ec71  brake-light-progressive.jpg.guetzli.jpg
 c25be9699cad0796f1c663d8a7d92c2e99dfb95bed4efd0d58c15d52bb4fe049  cloth-420.jpg.guetzli.jpg
 6bf2ef7d27a5a8614db0f698b1f977cd0f486fdc2d4260b90f0d64d653383c35  cloth.jpg.guetzli.jpg
-446850f4decd68d77c8a5b09d925fdc9075f44fdfe6dfb2e60a1064677678a3f  cloth.png.guetzli.jpg
+4c98a84823bb60753dbee01e4719215edb326b4b1db51e74f5c1602fe4b51656  cloth.png.guetzli.jpg
 6bf2ef7d27a5a8614db0f698b1f977cd0f486fdc2d4260b90f0d64d653383c35  cloth-progressive.jpg.guetzli.jpg
 e6407f3d38f70dde51584ee174ae29f53cc2d2d7e63812a3405d20079d67a45c  geranium2-420.jpg.guetzli.jpg
 14fb9aab1ebd6d7b8779566665fe2f94b07158b277f29296ef2f9fee71c3c4a4  geranium2.jpg.guetzli.jpg
-b7ded98029eca0ecf75bb01d5ea54a833fa13377136b4e7e404ae35503f82eb8  geranium2.png.guetzli.jpg
+d857d546a49e9e1c59f86be86656a87fcbfbbf77adc2abe39ca504c4a674e7b7  geranium2.png.guetzli.jpg
 14fb9aab1ebd6d7b8779566665fe2f94b07158b277f29296ef2f9fee71c3c4a4  geranium2-progressive.jpg.guetzli.jpg
 9eea5d54068ccaacfb1839c67c401685646c73edfdc38bac8e1e3a084e268f0d  geranium-420.jpg.guetzli.jpg
 4f249d42280d6f982fa093343236b318be887dc0f2241e125fcf6d4c913305e3  geranium.jpg.guetzli.jpg
@@ -39,12 +39,12 @@ e1fbdb05fe74f2d78cf6547621d99afea6e72069d8de68da274d66530b5dcdd7  green.jpg.guet
 3df6e963406121db078b99653d7c4e49ce2affe99b31212b026239d082748291  green.png.guetzli.jpg
 e1fbdb05fe74f2d78cf6547621d99afea6e72069d8de68da274d66530b5dcdd7  green-progressive.jpg.guetzli.jpg
 c2fcd25260b5c52871def4a7ef0136be7e7e7f63f836a974c51a4681a651d7c7  green-rose-420.jpg.guetzli.jpg
-90998e98318bb62538fe64f3b60d3100230474bf57dda72cd737eeee8ea482ae  green-rose.jpg.guetzli.jpg
+8cf041993b4ba59d5dd478ce4171a48ec8335301400de34286541e5a1769622c  green-rose.jpg.guetzli.jpg
 513e03accb79e60e9c8a2e9832bdbf9f1af8b23c6905cdb476e0626e1a7009d2  green-rose.png.guetzli.jpg
-90998e98318bb62538fe64f3b60d3100230474bf57dda72cd737eeee8ea482ae  green-rose-progressive.jpg.guetzli.jpg
+8cf041993b4ba59d5dd478ce4171a48ec8335301400de34286541e5a1769622c  green-rose-progressive.jpg.guetzli.jpg
 f1279ca9177e0aea7451bafa4abcd8ecfdf8a939ae97c974fbc802b668d8a56b  hand-420.jpg.guetzli.jpg
 8d2d8f4a95deea2dca8539a0c12ba8186ea93e7d9bcff9cb2c0bb9eab5504d1f  hand.jpg.guetzli.jpg
-4d156e4dbec82cb2f8fa324ea9f9142327d63853379cf3332775fdd40bbafc2f  hand.png.guetzli.jpg
+a71c821561c30d55fc4d83cfc7cb55f77ba6095749eafa38491b619c30b43571  hand.png.guetzli.jpg
 8d2d8f4a95deea2dca8539a0c12ba8186ea93e7d9bcff9cb2c0bb9eab5504d1f  hand-progressive.jpg.guetzli.jpg
 c3a3f86da0eeacc015504139181c19f874e9631336bb5f90fbf8a367058ec95f  lichen-420.jpg.guetzli.jpg
 44db143ce962b2eb45fcc1a79468d96ca7c677cba864efd9a984e3f86de5a0a5  lichen.jpg.guetzli.jpg
@@ -70,7 +70,7 @@ aceb338115241d9984510fd2e8a2bf46b3c5fc431e827a2d1efe496dff038675  port-420.jpg.g
 02aac145b6df57db2913952d3d46c8d456e2f000cff1ff4bfc574b27175335e0  port.jpg.guetzli.jpg
 157a25812bcf7bce343fe6c6a88932ee49117b46b5d3ba0aa3421edc8f2f1a09  port.png.guetzli.jpg
 02aac145b6df57db2913952d3d46c8d456e2f000cff1ff4bfc574b27175335e0  port-progressive.jpg.guetzli.jpg
-f41f613ecfae42d050115b785c1591724fcb7937c361b9c5d2a3248b7580953f  rainbow-420.jpg.guetzli.jpg
+c98795b77f49833c4e75bd58f78d1e58c0b7ca95990f8006247feb6e230c7f5d  rainbow-420.jpg.guetzli.jpg
 74d94a13c52b0d582c50d6bc70cecb6762c08740db6c234dff9b0e1c04fccbb5  rainbow.jpg.guetzli.jpg
 657efb5cfa742fbdfd6304703b131a63c2ddf8b686600840a800e7d94b4da0eb  rainbow.png.guetzli.jpg
 74d94a13c52b0d582c50d6bc70cecb6762c08740db6c234dff9b0e1c04fccbb5  rainbow-progressive.jpg.guetzli.jpg
@@ -86,7 +86,7 @@ c5499fdc97b3ae02d77ea12140d6da8ad645406e66adc88250a3c980bb70fe7d  red-rose-420.j
 f9a97e475af9127ea6d6d4d41fec52330ca075aae707185d90910fe198695e8d  red-rose.jpg.guetzli.jpg
 22f21955e7078745d03c1eb1985b8c5ffbd0b615870071a821102c44bd94af97  red-rose.png.guetzli.jpg
 f9a97e475af9127ea6d6d4d41fec52330ca075aae707185d90910fe198695e8d  red-rose-progressive.jpg.guetzli.jpg
-4df6d9b244c2d02cacff35ada998da3be13d1c9f5e42d4a2ab9b4725fc78dfa5  rgb-420.jpg.guetzli.jpg
+22cc4f5431c339e67958870a07c6ebc12fefdc849038230c9b8b98eac7f384ba  rgb-420.jpg.guetzli.jpg
 19256b30557be9dc6a7effe6418f2c1ba6e624940ef1f41c0ca71e356963014c  rgb.jpg.guetzli.jpg
 c1f8e4161a8b6baddea1d279f4d490670560d9c5d1161b66ee101c4250d8dd48  rgb.png.guetzli.jpg
 19256b30557be9dc6a7effe6418f2c1ba6e624940ef1f41c0ca71e356963014c  rgb-progressive.jpg.guetzli.jpg

From e5efe988858480e2ed144e57f35e0b69e8393728 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 2 Jun 2017 17:29:50 +0800
Subject: [PATCH 123/189] =?UTF-8?q?=E5=AE=8C=E6=88=90=E4=BB=A3=E7=A0=81?=
 =?UTF-8?q?=E6=B5=81=E7=A8=8B=EF=BC=8C=E4=BD=86=E8=AE=A1=E7=AE=97=E7=BB=93?=
 =?UTF-8?q?=E6=9E=9C=E8=BF=98=E9=9C=80=E8=A6=81=E6=A0=A1=E6=AD=A3=20cuDiff?=
 =?UTF-8?q?mapOpsinDynamicsImage=20cuComputeBlockZeroingOrder=20cuMask?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clbutter_comparator.cpp |  23 ++
 clguetzli/clguetzli.cpp           | 634 +++++++++++++++++++++++++++++-
 clguetzli/ocu.cpp                 |   2 +-
 guetzli/processor.cc              |  38 +-
 4 files changed, 669 insertions(+), 28 deletions(-)

diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index 64c0a3dd..53cd89fb 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -20,6 +20,12 @@ namespace butteraugli
             clDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_);
         }
+        else if (g_useCuda && xsize_ > 100 && ysize_ > 100)
+        {
+            result.resize(xsize_ * ysize_);
+            cuDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(), // CUDA branch: use the cu* entry point, not the OpenCL one
+                xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_);
+        }
         else
         {
             ButteraugliComparator::DiffmapOpsinDynamicsImage(xyb0, xyb1, result);
@@ -171,6 +177,23 @@ namespace butteraugli
                 );
             return;
         }
+        else if (g_useCuda && xsize > 100 && ysize > 100)
+        {
+            mask->resize(3);
+            mask_dc->resize(3);
+            for (int i = 0; i < 3; i++)
+            {
+                (*mask)[i].resize(xsize * ysize);
+                (*mask_dc)[i].resize(xsize * ysize);
+            }
+            cuMask((*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(),
+                (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data(),
+                xsize, ysize,
+                xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+                xyb1[0].data(), xyb1[1].data(), xyb1[2].data()
+            );
+            return;
+        }
 
         _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc);
 
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 7b2cd995..b0e9fefe 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -97,8 +97,8 @@ void clDiffmapOpsinDynamicsImage(
     const size_t res_xsize = (xsize + step - 1) / step;
     const size_t res_ysize = (ysize + step - 1) / step;
 
-    cl_int channel_size = xsize * ysize * sizeof(float);
-    cl_int channel_step_size = res_xsize * res_ysize * sizeof(float);
+    size_t channel_size = xsize * ysize * sizeof(float);
+    size_t channel_step_size = res_xsize * res_ysize * sizeof(float);
 
     cl_int err = 0;
     ocl_args_d_t &ocl = getOcl();
@@ -180,16 +180,13 @@ void clComputeBlockZeroingOrder(
     cl_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale);
 
     int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height;
-    cl_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size);
+    cl_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch);
     cl_float clBlockErrorLimit = BlockErrorLimit;
     cl_int clWidth = image_width;
     cl_int clHeight = image_height;
     cl_int clFactor = factor;
     cl_int clMask = comp_mask;
 
-	clEnqueueWriteBuffer(ocl.commandQueue, mem_output_order_batch, CL_FALSE, 0, output_order_batch_size, output_order_batch, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
-
     cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER];
     clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_coeff[0]);
     clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_coeff[1]);
@@ -250,7 +247,7 @@ void clMask(
     cl_int err = CL_SUCCESS;
     ocl_args_d_t &ocl = getOcl();
 
-    cl_int channel_size = xsize * ysize * sizeof(float);
+    size_t channel_size = xsize * ysize * sizeof(float);
 
     ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
     ocl_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2);
@@ -502,7 +499,7 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t
 {
 	static const double kSigma = 1.1;
 
-	cl_int channel_size = xsize * ysize * sizeof(float);
+	size_t channel_size = xsize * ysize * sizeof(float);
 
 	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
@@ -541,7 +538,7 @@ void clMaskHighIntensityChangeEx(
     ocl_channels &xyb1/*in,out*/,
     const size_t xsize, const size_t ysize)
 {
-	cl_int channel_size = xsize * ysize * sizeof(float);
+	size_t channel_size = xsize * ysize * sizeof(float);
 
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -592,7 +589,7 @@ void clEdgeDetectorMapEx(
     const ocl_channels &rgb, const ocl_channels &rgb2, 
     const size_t xsize, const size_t ysize, const size_t step)
 {
-	cl_int channel_size = xsize * ysize * sizeof(float);
+	size_t channel_size = xsize * ysize * sizeof(float);
 
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
@@ -690,7 +687,7 @@ void clEdgeDetectorLowFreqEx(
     const ocl_channels &rgb, const ocl_channels &rgb2,
 	const size_t xsize, const size_t ysize, const size_t step)
 {
-	cl_int channel_size = xsize * ysize * sizeof(float);
+	size_t channel_size = xsize * ysize * sizeof(float);
 
 	static const double kSigma = 14;
 
@@ -885,9 +882,6 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
-	cl_int clxsize = xsize;
-	cl_int clysize = ysize;
-
 	double extmul = 0.975741017749;
 	double extoff = -4.25328244168;
 	double offset = 0.454909521427;
@@ -1218,6 +1212,490 @@ void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, cons
     ocu.releaseMemChannels(rgb);
 }
 
+void cuMaskHighIntensityChangeEx(
+    ocu_channels &xyb0/*in,out*/,
+    ocu_channels &xyb1/*in,out*/,
+    const size_t xsize, const size_t ysize)
+{
+    size_t channel_size = xsize * ysize * sizeof(float);
+
+    ocu_args_d_t &ocl = getOcu();
+
+    ocu_channels c0 = ocl.allocMemChannels(channel_size);
+    ocu_channels c1 = ocl.allocMemChannels(channel_size);
+
+    cuMemcpyDtoD(c0.r, xyb0.r, channel_size);
+    cuMemcpyDtoD(c0.g, xyb0.g, channel_size);
+    cuMemcpyDtoD(c0.b, xyb0.b, channel_size);
+    cuMemcpyDtoD(c1.r, xyb1.r, channel_size);
+    cuMemcpyDtoD(c1.g, xyb1.g, channel_size);
+    cuMemcpyDtoD(c1.b, xyb1.b, channel_size);
+
+    const void *args[] = { &xyb0.r, &xyb0.g, &xyb0.b,
+                           &xyb1.r, &xyb1.g, &xyb1.b,
+                           &c0.r, &c0.g, &c0.b,
+                           &c1.r, &c1.g, &c1.b };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    ocl.releaseMemChannels(c0);
+    ocl.releaseMemChannels(c1);
+}
+
+void cuEdgeDetectorMapEx(
+    CUdeviceptr result/*out*/,
+    const ocu_channels &rgb, const ocu_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step)
+{
+    size_t channel_size = xsize * ysize * sizeof(float);
+
+    ocu_args_d_t &ocl = getOcu();
+
+    ocu_channels rgb_blured = ocl.allocMemChannels(channel_size);
+    ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size);
+
+    static const double kSigma[3] = { 1.5, 0.586, 0.4 };
+
+    for (int i = 0; i < 3; i++)
+    {
+        cuBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]);
+        cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]);
+    }
+
+    const void *args[] = { &result,
+                           &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
+                           &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
+                           &xsize, &ysize, &step };
+
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTOR],
+        res_xsize, res_ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    ocl.releaseMemChannels(rgb_blured);
+    ocl.releaseMemChannels(rgb2_blured);
+}
+
+void cuBlockDiffMapEx(
+    CUdeviceptr block_diff_dc/*out*/,
+    CUdeviceptr block_diff_ac/*out*/,
+    const ocu_channels &rgb, const ocu_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    const void *args[] = { &block_diff_dc, &block_diff_ac,
+                           &rgb.r, &rgb.g, &rgb.b,
+                           &rgb2.r, &rgb2.g, &rgb2.b,
+                           &xsize, &ysize, &step };
+
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_BLOCKDIFFMAP],
+        res_xsize, res_ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+}
+
+void cuEdgeDetectorLowFreqEx(
+    CUdeviceptr block_diff_ac/*in,out*/,
+    const ocu_channels &rgb, const ocu_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step)
+{
+    size_t channel_size = xsize * ysize * sizeof(float);
+
+    static const double kSigma = 14;
+
+    ocu_args_d_t &ocl = getOcu();
+    ocu_channels rgb_blured = ocl.allocMemChannels(channel_size);
+    ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size);
+
+    for (int i = 0; i < 3; i++)
+    {
+        cuBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]);
+        cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]);
+    }
+
+    const void *args[] = { &block_diff_ac,
+                           &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
+                           &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
+                           &xsize, &ysize, &step };
+
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ],
+        res_xsize, res_ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    ocl.releaseMemChannels(rgb_blured);
+    ocl.releaseMemChannels(rgb2_blured);
+}
+
+void cuDiffPrecomputeEx(
+    ocu_channels &mask/*out*/,
+    const ocu_channels &xyb0, const ocu_channels &xyb1,
+    const size_t xsize, const size_t ysize)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    const void *args[] = { &mask.x, &mask.y, &mask.b,
+                           &xyb0.x, &xyb0.y, &xyb0.b, 
+                           &xyb1.x, &xyb1.y, &xyb1.b };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DIFFPRECOMPUTE],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+}
+
+void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    const void *args[] = { &img, &w };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_SCALEIMAGE],
+        size, 1, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+}
+
+void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t ysize)
+{
+    if (xsize < 4 || ysize < 4) {
+        // TODO: Make this work for small dimensions as well.
+        return;
+    }
+
+    ocu_args_d_t &ocl = getOcu();
+
+    size_t len = xsize * ysize * sizeof(float);
+    CUdeviceptr img_org = ocl.allocMem(len);
+
+    cuMemcpyDtoD(img_org, img, len);
+
+    const void *args[] = { &img, &img_org};
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_AVERAGE5X5],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    cuMemFree(img_org);
+}
+
+void cuMinSquareValEx(
+    CUdeviceptr img/*in,out*/,
+    const size_t xsize, const size_t ysize,
+    const size_t square_size, const size_t offset)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    CUdeviceptr srcA = ocl.allocMem(sizeof(float) * xsize * ysize);
+
+    const void *args[] = { &srcA, &img, &square_size, &offset};
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MINSQUAREVAL],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    cuMemcpyDtoD(img, srcA, sizeof(float) * xsize * ysize);
+    cuMemFree(srcA);
+}
+
+void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, size_t xsize, size_t ysize)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    double extmul = 0.975741017749;
+    double extoff = -4.25328244168;
+    double offset = 0.454909521427;
+    double scaler = 0.0738288224836;
+    double mul = 20.8029176447;
+    static double lut_x[512];
+    static bool lutx_init = false;
+    if (!lutx_init)
+    {
+        lutx_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_x);
+    }
+
+    extmul = 0.373995618954;
+    extoff = 1.5307267433;
+    offset = 0.911952641929;
+    scaler = 1.1731667845;
+    mul = 16.2447033988;
+    static double lut_y[512];
+    static bool luty_init = false;
+    if (!luty_init)
+    {
+        luty_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_y);
+    }
+
+    extmul = 0.61582234137;
+    extoff = -4.25376118646;
+    offset = 1.05105070921;
+    scaler = 0.47434643535;
+    mul = 31.1444967089;
+    static double lut_b[512];
+    static bool lutb_init = false;
+    if (!lutb_init)
+    {
+        lutb_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_b);
+    }
+
+    extmul = 1.79116943438;
+    extoff = -3.86797479189;
+    offset = 0.670960225853;
+    scaler = 0.486575865525;
+    mul = 20.4563479139;
+    static double lut_dcx[512];
+    static bool lutdcx_init = false;
+    if (!lutdcx_init)
+    {
+        lutdcx_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx);
+    }
+
+    extmul = 0.212223514236;
+    extoff = -3.65647120524;
+    offset = 1.73396799447;
+    scaler = 0.170392660501;
+    mul = 21.6566724788;
+    static double lut_dcy[512];
+    static bool lutdcy_init = false;
+    if (!lutdcy_init)
+    {
+        lutdcy_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy);
+    }
+
+    extmul = 0.349376011816;
+    extoff = -0.894711072781;
+    offset = 0.901647926679;
+    scaler = 0.380086095024;
+    mul = 18.0373825149;
+    static double lut_dcb[512];
+    static bool lutdcb_init = false;
+    if (!lutdcb_init)
+    {
+        lutdcb_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb);
+    }
+
+    size_t channel_size = 512 * sizeof(double); // per-channel byte count: each LUT is its own double[512], so 3x would read past the array
+    ocu_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b);
+    ocu_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb);
+
+    const void *args[] = { &mask.r, &mask.g, &mask.b,
+                           &mask_dc.r, &mask_dc.g, &mask_dc.b, 
+                           &xyb.x, &xyb.y, &xyb.b,
+                           &xyb_dc.x, &xyb_dc.y, &xyb_dc.b};
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DOMASK],
+            xsize, ysize, 1,
+            1, 1, 1,
+            0,
+            ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    ocl.releaseMemChannels(xyb);
+    ocl.releaseMemChannels(xyb_dc);
+}
+
+void cuMaskEx(
+    ocu_channels mask/*out*/, ocu_channels mask_dc/*out*/,
+    const ocu_channels &rgb, const ocu_channels &rgb2,
+    const size_t xsize, const size_t ysize)
+{
+    cuDiffPrecomputeEx(mask, rgb, rgb2, xsize, ysize);
+    for (int i = 0; i < 3; i++)
+    {
+        cuAverage5x5Ex(mask.ch[i], xsize, ysize);
+        cuMinSquareValEx(mask.ch[i], xsize, ysize, 4, 0);
+
+        static const double sigma[3] = {
+            9.65781083553,
+            14.2644604355,
+            4.53358927369,
+        };
+
+        cuBlurEx(mask.ch[i], xsize, ysize, sigma[i], 0.0);
+    }
+
+    cuDoMask(mask, mask_dc, xsize, ysize);
+
+    for (int i = 0; i < 3; i++)
+    {
+        cuScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale);
+        cuScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale);
+    }
+}
+
+void cuCombineChannelsEx(
+    CUdeviceptr result/*out*/,
+    const ocu_channels &mask,
+    const ocu_channels &mask_dc,
+    const size_t xsize, const size_t ysize,
+    const CUdeviceptr block_diff_dc,
+    const CUdeviceptr block_diff_ac,
+    const CUdeviceptr edge_detector_map,
+    const size_t res_xsize,
+    const size_t step)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step;
+    const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step;
+
+    const void *args[] = { &result,
+                           &mask.r, &mask.g, &mask.b,
+                           &mask_dc.r, &mask_dc.g, &mask_dc.b,
+                           &xsize, &ysize,
+                           &block_diff_dc, &block_diff_ac, &edge_detector_map,
+                           &res_xsize,
+                           &step };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_COMBINECHANNELS],
+        work_xsize, work_ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+}
+
+void cuUpsampleSquareRootEx(CUdeviceptr diffmap, const size_t xsize, const size_t ysize, const int step)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    CUdeviceptr diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float));
+
+    const void *args[] = { &diffmap_out,
+                           &diffmap,
+                           &xsize, &ysize,
+                           &step };
+
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_UPSAMPLESQUAREROOT],
+        res_xsize, res_ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    cuMemcpyDtoD(diffmap, diffmap_out, xsize * ysize * sizeof(float));
+
+    cuMemFree(diffmap_out);
+}
+
+void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr in, const size_t xsize, const size_t ysize, const int step)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    int cls = 8 - step;
+    int cls2 = (8 - step) / 2;
+
+    const void *args[] = { &out,
+                           &in,
+                           &xsize,
+                           &cls,
+                           &cls2 };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_REMOVEBORDER],
+        xsize - cls, ysize - cls, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+}
+
+void cuAddBorderEx(CUdeviceptr out, size_t xsize, size_t ysize, int step, CUdeviceptr in)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    int cls = 8 - step;
+    int cls2 = (8 - step) / 2;
+
+    const void *args[] = { &out,
+        &cls,
+        &cls2,
+        &in};
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_ADDBORDER],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+}
+
+void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step)
+{
+    cuUpsampleSquareRootEx(diffmap, xsize, ysize, step);
+
+    static const double kSigma = 8.8510880283;
+    static const double mul1 = 24.8235314874;
+    static const double scale = 1.0 / (1.0 + mul1);
+
+    const int s = 8 - step;
+    int s2 = (8 - step) / 2;
+
+    ocu_args_d_t &ocl = getOcu();
+    CUdeviceptr blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float));
+    cuRemoveBorderEx(blurred, diffmap, xsize, ysize, step);
+
+    static const double border_ratio = 0.03027655136;
+    cuBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio);
+
+    cuAddBorderEx(diffmap, xsize, ysize, step, blurred);
+    cuScaleImageEx(diffmap, xsize * ysize, scale);
+
+    cuMemFree(blurred);
+}
+
 void cuDiffmapOpsinDynamicsImage(
     float* result,
     const float* r, const float* g, const float* b,
@@ -1225,7 +1703,50 @@ void cuDiffmapOpsinDynamicsImage(
     const size_t xsize, const size_t ysize,
     const size_t step)
 {
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
+    size_t channel_size = xsize * ysize * sizeof(float);
+    size_t channel_step_size = res_xsize * res_ysize * sizeof(float);
+
+    CUresult err = CUDA_SUCCESS;
+    ocu_args_d_t &ocl = getOcu();
+    ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
+    ocu_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
+
+    CUdeviceptr mem_result = ocl.allocMem(channel_size, result);
+
+    CUdeviceptr edge_detector_map = ocl.allocMem(3 * channel_step_size);
+    CUdeviceptr block_diff_dc = ocl.allocMem(3 * channel_step_size);
+    CUdeviceptr block_diff_ac = ocl.allocMem(3 * channel_step_size);
+
+    cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
+
+    cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step);
+    cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step);
+    cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step);
+    {
+        ocu_channels mask = ocl.allocMemChannels(channel_size);
+        ocu_channels mask_dc = ocl.allocMemChannels(channel_size);
+        cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize);
+        cuCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step);
+
+        ocl.releaseMemChannels(mask);
+        ocl.releaseMemChannels(mask_dc);
+    }
 
+    cuCalculateDiffmapEx(mem_result, xsize, ysize, step);
+
+    cuMemcpyDtoH(result, mem_result, channel_size);
+
+    ocl.releaseMemChannels(xyb1);
+    ocl.releaseMemChannels(xyb0);
+
+    cuMemFree(edge_detector_map);
+    cuMemFree(block_diff_dc);
+    cuMemFree(block_diff_ac);
+
+    cuMemFree(mem_result);
 }
 
 void cuComputeBlockZeroingOrder(
@@ -1240,7 +1761,67 @@ void cuComputeBlockZeroingOrder(
     const int comp_mask,
     const float BlockErrorLimit)
 {
+    const int block8_width = (image_width + 8 - 1) / 8;
+    const int block8_height = (image_height + 8 - 1) / 8;
+    const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor);
+    const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor);
+
+    using namespace guetzli;
+
+    cl_int err = 0;
+    ocu_args_d_t &ocl = getOcu();
+
+    CUdeviceptr mem_orig_coeff[3];
+    CUdeviceptr mem_mayout_coeff[3];
+    CUdeviceptr mem_mayout_pixel[3];
+    for (int c = 0; c < 3; c++)
+    {
+        int block_count = orig_channel[c].block_width * orig_channel[c].block_height;
+        mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff);
+
+        block_count = mayout_channel[c].block_width * mayout_channel[c].block_height;
+        mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff);
+
+        mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel);
+    }
+    CUdeviceptr mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch);
+    CUdeviceptr mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale);
+
+    int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height;
+    CUdeviceptr mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch);
+
+    const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2],
+                           &mem_orig_image, &mem_mask_scale, // one image buffer, one mask-scale buffer; a duplicate entry would shift every later kernel parameter
+                           &image_width, &image_height,
+                           &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2],
+                           &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2],
+                           &mayout_channel[0], &mayout_channel[1], &mayout_channel[2],
+                           &factor,
+                           &comp_mask,
+                           &BlockErrorLimit,
+                           &mem_output_order_batch};
+
+    err = cuLaunchKernel(ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER],
+        blockf_width, blockf_height, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size);
+
+    for (int c = 0; c < 3; c++)
+    {
+        cuMemFree(mem_orig_coeff[c]);
+        cuMemFree(mem_mayout_coeff[c]);
+        cuMemFree(mem_mayout_pixel[c]);
+
+    }
 
+    cuMemFree(mem_orig_image);
+    cuMemFree(mem_mask_scale);
+    cuMemFree(mem_output_order_batch);
 }
 
 void cuMask(
@@ -1250,7 +1831,29 @@ void cuMask(
     const float* r, const float* g, const float* b,
     const float* r2, const float* g2, const float* b2)
 {
+    CUresult err = CUDA_SUCCESS;
+    ocu_args_d_t &ocl = getOcu();
 
+    size_t channel_size = xsize * ysize * sizeof(float);
+
+    ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
+    ocu_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2);
+    ocu_channels mask = ocl.allocMemChannels(channel_size);
+    ocu_channels mask_dc = ocl.allocMemChannels(channel_size);
+
+    cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize);
+
+    cuMemcpyDtoH(mask_r, mask.r, channel_size);       // copy each device plane back to its matching host plane
+    cuMemcpyDtoH(mask_g, mask.g, channel_size);
+    cuMemcpyDtoH(mask_b, mask.b, channel_size);
+    cuMemcpyDtoH(maskdc_r, mask_dc.r, channel_size);
+    cuMemcpyDtoH(maskdc_g, mask_dc.g, channel_size);
+    cuMemcpyDtoH(maskdc_b, mask_dc.b, channel_size);
+
+    ocl.releaseMemChannels(rgb);
+    ocl.releaseMemChannels(rgb2);
+    ocl.releaseMemChannels(mask);
+    ocl.releaseMemChannels(mask_dc);
 }
 
 void cuConvolutionXEx(
@@ -1259,12 +1862,11 @@ void cuConvolutionXEx(
     const CUdeviceptr multipliers, size_t len,
     int xstep, int offset, double border_ratio)
 {
-    CUresult err = CUDA_SUCCESS;
     ocu_args_d_t &ocu = getOcu();
 
     const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
 
-    err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX],
+    CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX],
         xsize, ysize, 1,
         1, 1, 1,
         0,
diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index 3a263c3d..c99d6ea9 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -117,7 +117,7 @@ CUdeviceptr ocu_args_d_t::allocMem(size_t s, const void *init)
     cuMemAlloc(&mem, s);
     if (init)
     {
-        cuMemcpyHtoDAsync(mem, init, s, this->stream);
+        cuMemcpyHtoD(mem, init, s);
     }
     else
     {
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 35783e41..4690aff1 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -567,7 +567,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
     CoeffData * output_order = NULL;
     ButteraugliComparatorEx * comp = (ButteraugliComparatorEx*)comparator_;
 
-    if (g_useOpenCL || g_checkOpenCL)
+    if (g_useOpenCL || g_useCuda || g_checkOpenCL)
     {
         channel_info orig_channel[3];
         channel_info mayout_channel[3];
@@ -588,16 +588,32 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
         output_order_gpu.resize(num_blocks * kBlockSize);
         output_order = output_order_gpu.data();
 
-        clComputeBlockZeroingOrder(output_order,
-                                    orig_channel,
-                                    comp->imgOpsinDynamicsBlockList.data(),
-                                    comp->imgMaskXyzScaleBlockList.data(),
-                                    width,
-                                    height,
-                                    mayout_channel,
-                                    factor_x,
-                                    comp_mask,
-                                    comp->BlockErrorLimit());
+        if (g_useCuda)
+        {
+            clComputeBlockZeroingOrder(output_order,
+                orig_channel,
+                comp->imgOpsinDynamicsBlockList.data(),
+                comp->imgMaskXyzScaleBlockList.data(),
+                width,
+                height,
+                mayout_channel,
+                factor_x,
+                comp_mask,
+                comp->BlockErrorLimit());
+        }
+        else
+        {
+            clComputeBlockZeroingOrder(output_order,
+                orig_channel,
+                comp->imgOpsinDynamicsBlockList.data(),
+                comp->imgMaskXyzScaleBlockList.data(),
+                width,
+                height,
+                mayout_channel,
+                factor_x,
+                comp_mask,
+                comp->BlockErrorLimit());
+        }
 
     }
     if (!g_useOpenCL || g_checkOpenCL)

From 9a6a17cc54374b31fbe952d411771980cd94a27c Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 2 Jun 2017 17:32:41 +0800
Subject: [PATCH 124/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=20cuMask=20=E8=AE=A1?=
 =?UTF-8?q?=E7=AE=97=E7=BB=93=E6=9E=9C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index b0e9fefe..4a49ef1e 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -1844,11 +1844,11 @@ void cuMask(
     cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize);
 
     cuMemcpyDtoH(mask_r, mask.r, channel_size);
-    cuMemcpyDtoH(mask_g, mask.r, channel_size);
-    cuMemcpyDtoH(mask_b, mask.r, channel_size);
+    cuMemcpyDtoH(mask_g, mask.g, channel_size);
+    cuMemcpyDtoH(mask_b, mask.b, channel_size);
     cuMemcpyDtoH(maskdc_r, mask_dc.r, channel_size);
-    cuMemcpyDtoH(maskdc_g, mask_dc.r, channel_size);
-    cuMemcpyDtoH(maskdc_b, mask_dc.r, channel_size);
+    cuMemcpyDtoH(maskdc_g, mask_dc.g, channel_size);
+    cuMemcpyDtoH(maskdc_b, mask_dc.b, channel_size);
 
     ocl.releaseMemChannels(rgb);
     ocl.releaseMemChannels(rgb2);

From 3345026b39685108d524b4d51c68e20f34f80e28 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 3 Jun 2017 00:04:54 +0800
Subject: [PATCH 125/189] =?UTF-8?q?=E8=B0=83=E6=95=B4cu=E4=BB=A3=E7=A0=81?=
 =?UTF-8?q?=E7=BB=93=E6=9E=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cpp | 790 +--------------------------------------
 clguetzli/clguetzli.h   |  37 +-
 clguetzli/cuguetzli.cpp | 801 ++++++++++++++++++++++++++++++++++++++++
 clguetzli/cuguetzli.h   |  37 ++
 guetzli.vcxproj         |   2 +
 guetzli.vcxproj.filters |   6 +
 guetzli/processor.cc    |   2 +-
 7 files changed, 850 insertions(+), 825 deletions(-)
 create mode 100644 clguetzli/cuguetzli.cpp
 create mode 100644 clguetzli/cuguetzli.h

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 4a49ef1e..f50ce17c 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -2,7 +2,6 @@
 #include <math.h>
 #include <algorithm>
 #include <vector>
-#include "ocu.h"
 
 extern bool g_useOpenCL = false;
 extern bool g_useCuda = false;
@@ -73,7 +72,6 @@ void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, cons
 {
     size_t channel_size = xsize * ysize * sizeof(float);
 
-    cl_int err = 0;
     ocl_args_d_t &ocl = getOcl();
     ocl_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
 
@@ -82,7 +80,7 @@ void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, cons
     clEnqueueReadBuffer(ocl.commandQueue, rgb.r, false, 0, channel_size, r, 0, NULL, NULL);
     clEnqueueReadBuffer(ocl.commandQueue, rgb.g, false, 0, channel_size, g, 0, NULL, NULL);
     clEnqueueReadBuffer(ocl.commandQueue, rgb.b, false, 0, channel_size, b, 0, NULL, NULL);
-    err = clFinish(ocl.commandQueue);
+    clFinish(ocl.commandQueue);
 
     ocl.releaseMemChannels(rgb);
 }
@@ -1193,789 +1191,3 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si
 
 	clReleaseMemObject(blurred);
 }
-
-//////////////////////////////////////////////////////////////////////////////////////
-void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize)
-{
-    size_t channel_size = xsize * ysize * sizeof(float);
-
-    CUresult err = CUDA_SUCCESS;
-    ocu_args_d_t &ocu = getOcu();
-    ocu_channels rgb = ocu.allocMemChannels(channel_size, r, g, b);
-
-    cuOpsinDynamicsImageEx(rgb, xsize, ysize);
-
-    cuMemcpyDtoH(r, rgb.r, channel_size);
-    cuMemcpyDtoH(g, rgb.g, channel_size);
-    cuMemcpyDtoH(b, rgb.b, channel_size);
-
-    ocu.releaseMemChannels(rgb);
-}
-
-void cuMaskHighIntensityChangeEx(
-    ocu_channels &xyb0/*in,out*/,
-    ocu_channels &xyb1/*in,out*/,
-    const size_t xsize, const size_t ysize)
-{
-    size_t channel_size = xsize * ysize * sizeof(float);
-
-    ocu_args_d_t &ocl = getOcu();
-
-    ocu_channels c0 = ocl.allocMemChannels(channel_size);
-    ocu_channels c1 = ocl.allocMemChannels(channel_size);
-
-    cuMemcpyDtoD(c0.r, xyb0.r, channel_size);
-    cuMemcpyDtoD(c0.g, xyb0.g, channel_size);
-    cuMemcpyDtoD(c0.b, xyb0.b, channel_size);
-    cuMemcpyDtoD(c1.r, xyb1.r, channel_size);
-    cuMemcpyDtoD(c1.g, xyb1.g, channel_size);
-    cuMemcpyDtoD(c1.b, xyb1.b, channel_size);
-
-    const void *args[] = { &xyb0.r, &xyb0.g, &xyb0.b,
-                           &xyb1.r, &xyb1.g, &xyb1.b,
-                           &c0.r, &c0.g, &c0.b,
-                           &c1.r, &c1.g, &c1.b };
-
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE],
-        xsize, ysize, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-
-    ocl.releaseMemChannels(c0);
-    ocl.releaseMemChannels(c1);
-}
-
-void cuEdgeDetectorMapEx(
-    CUdeviceptr result/*out*/,
-    const ocu_channels &rgb, const ocu_channels &rgb2,
-    const size_t xsize, const size_t ysize, const size_t step)
-{
-    size_t channel_size = xsize * ysize * sizeof(float);
-
-    ocu_args_d_t &ocl = getOcu();
-
-    ocu_channels rgb_blured = ocl.allocMemChannels(channel_size);
-    ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size);
-
-    static const double kSigma[3] = { 1.5, 0.586, 0.4 };
-
-    for (int i = 0; i < 3; i++)
-    {
-        cuBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]);
-        cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]);
-    }
-
-    const void *args[] = { &result,
-                           &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
-                           &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
-                           &xsize, &ysize, &step };
-
-    const size_t res_xsize = (xsize + step - 1) / step;
-    const size_t res_ysize = (ysize + step - 1) / step;
-
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTOR],
-        res_xsize, res_ysize, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-
-    ocl.releaseMemChannels(rgb_blured);
-    ocl.releaseMemChannels(rgb2_blured);
-}
-
-void cuBlockDiffMapEx(
-    CUdeviceptr block_diff_dc/*out*/,
-    CUdeviceptr block_diff_ac/*out*/,
-    const ocu_channels &rgb, const ocu_channels &rgb2,
-    const size_t xsize, const size_t ysize, const size_t step)
-{
-    ocu_args_d_t &ocl = getOcu();
-
-    const void *args[] = { &block_diff_dc, &block_diff_ac,
-                           &rgb.r, &rgb.g, &rgb.b,
-                           &rgb2.r, &rgb2.g, &rgb2.b,
-                           &xsize, &ysize, &step };
-
-    const size_t res_xsize = (xsize + step - 1) / step;
-    const size_t res_ysize = (ysize + step - 1) / step;
-
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_BLOCKDIFFMAP],
-        res_xsize, res_ysize, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-}
-
-void cuEdgeDetectorLowFreqEx(
-    CUdeviceptr block_diff_ac/*in,out*/,
-    const ocu_channels &rgb, const ocu_channels &rgb2,
-    const size_t xsize, const size_t ysize, const size_t step)
-{
-    size_t channel_size = xsize * ysize * sizeof(float);
-
-    static const double kSigma = 14;
-
-    ocu_args_d_t &ocl = getOcu();
-    ocu_channels rgb_blured = ocl.allocMemChannels(channel_size);
-    ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size);
-
-    for (int i = 0; i < 3; i++)
-    {
-        cuBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]);
-        cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]);
-    }
-
-    const void *args[] = { &block_diff_ac,
-                           &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
-                           &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
-                           &xsize, &ysize, &step };
-
-    const size_t res_xsize = (xsize + step - 1) / step;
-    const size_t res_ysize = (ysize + step - 1) / step;
-
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ],
-        res_xsize, res_ysize, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-
-    ocl.releaseMemChannels(rgb_blured);
-    ocl.releaseMemChannels(rgb2_blured);
-}
-
-void cuDiffPrecomputeEx(
-    ocu_channels &mask/*out*/,
-    const ocu_channels &xyb0, const ocu_channels &xyb1,
-    const size_t xsize, const size_t ysize)
-{
-    ocu_args_d_t &ocl = getOcu();
-
-    const void *args[] = { &mask.x, &mask.y, &mask.b,
-                           &xyb0.x, &xyb0.y, &xyb0.b, 
-                           &xyb1.x, &xyb1.y, &xyb1.b };
-
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DIFFPRECOMPUTE],
-        xsize, ysize, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-}
-
-void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w)
-{
-    ocu_args_d_t &ocl = getOcu();
-
-    const void *args[] = { &img, &w };
-
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_SCALEIMAGE],
-        size, 1, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-}
-
-void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t ysize)
-{
-    if (xsize < 4 || ysize < 4) {
-        // TODO: Make this work for small dimensions as well.
-        return;
-    }
-
-    ocu_args_d_t &ocl = getOcu();
-
-    size_t len = xsize * ysize * sizeof(float);
-    CUdeviceptr img_org = ocl.allocMem(len);
-
-    cuMemcpyDtoD(img_org, img, len);
-
-    const void *args[] = { &img, &img_org};
-
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_AVERAGE5X5],
-        xsize, ysize, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-
-    cuMemFree(img_org);
-}
-
-void cuMinSquareValEx(
-    CUdeviceptr img/*in,out*/,
-    const size_t xsize, const size_t ysize,
-    const size_t square_size, const size_t offset)
-{
-    ocu_args_d_t &ocl = getOcu();
-
-    CUdeviceptr srcA = ocl.allocMem(sizeof(float) * xsize * ysize);
-
-    const void *args[] = { &srcA, &img, &square_size, &offset};
-
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MINSQUAREVAL],
-        xsize, ysize, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-
-    cuMemcpyDtoD(img, srcA, sizeof(float) * xsize * ysize);
-    cuMemFree(srcA);
-}
-
-void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, size_t xsize, size_t ysize)
-{
-    ocu_args_d_t &ocl = getOcu();
-
-    double extmul = 0.975741017749;
-    double extoff = -4.25328244168;
-    double offset = 0.454909521427;
-    double scaler = 0.0738288224836;
-    double mul = 20.8029176447;
-    static double lut_x[512];
-    static bool lutx_init = false;
-    if (!lutx_init)
-    {
-        lutx_init = true;
-        MakeMask(extmul, extoff, mul, offset, scaler, lut_x);
-    }
-
-    extmul = 0.373995618954;
-    extoff = 1.5307267433;
-    offset = 0.911952641929;
-    scaler = 1.1731667845;
-    mul = 16.2447033988;
-    static double lut_y[512];
-    static bool luty_init = false;
-    if (!luty_init)
-    {
-        luty_init = true;
-        MakeMask(extmul, extoff, mul, offset, scaler, lut_y);
-    }
-
-    extmul = 0.61582234137;
-    extoff = -4.25376118646;
-    offset = 1.05105070921;
-    scaler = 0.47434643535;
-    mul = 31.1444967089;
-    static double lut_b[512];
-    static bool lutb_init = false;
-    if (!lutb_init)
-    {
-        lutb_init = true;
-        MakeMask(extmul, extoff, mul, offset, scaler, lut_b);
-    }
-
-    extmul = 1.79116943438;
-    extoff = -3.86797479189;
-    offset = 0.670960225853;
-    scaler = 0.486575865525;
-    mul = 20.4563479139;
-    static double lut_dcx[512];
-    static bool lutdcx_init = false;
-    if (!lutdcx_init)
-    {
-        lutdcx_init = true;
-        MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx);
-    }
-
-    extmul = 0.212223514236;
-    extoff = -3.65647120524;
-    offset = 1.73396799447;
-    scaler = 0.170392660501;
-    mul = 21.6566724788;
-    static double lut_dcy[512];
-    static bool lutdcy_init = false;
-    if (!lutdcy_init)
-    {
-        lutdcy_init = true;
-        MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy);
-    }
-
-    extmul = 0.349376011816;
-    extoff = -0.894711072781;
-    offset = 0.901647926679;
-    scaler = 0.380086095024;
-    mul = 18.0373825149;
-    static double lut_dcb[512];
-    static bool lutdcb_init = false;
-    if (!lutdcb_init)
-    {
-        lutdcb_init = true;
-        MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb);
-    }
-
-    size_t channel_size = 512 * 3 * sizeof(double);
-    ocu_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b);
-    ocu_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb);
-
-    const void *args[] = { &mask.r, &mask.g, &mask.b,
-                           &mask_dc.r, &mask_dc.g, &mask_dc.b, 
-                           &xyb.x, &xyb.y, &xyb.b,
-                           &xyb_dc.x, &xyb_dc.y, &xyb_dc.b};
-
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DOMASK],
-            xsize, ysize, 1,
-            1, 1, 1,
-            0,
-            ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-
-    ocl.releaseMemChannels(xyb);
-    ocl.releaseMemChannels(xyb_dc);
-}
-
-void cuMaskEx(
-    ocu_channels mask/*out*/, ocu_channels mask_dc/*out*/,
-    const ocu_channels &rgb, const ocu_channels &rgb2,
-    const size_t xsize, const size_t ysize)
-{
-    cuDiffPrecomputeEx(mask, rgb, rgb2, xsize, ysize);
-    for (int i = 0; i < 3; i++)
-    {
-        cuAverage5x5Ex(mask.ch[i], xsize, ysize);
-        cuMinSquareValEx(mask.ch[i], xsize, ysize, 4, 0);
-
-        static const double sigma[3] = {
-            9.65781083553,
-            14.2644604355,
-            4.53358927369,
-        };
-
-        cuBlurEx(mask.ch[i], xsize, ysize, sigma[i], 0.0);
-    }
-
-    cuDoMask(mask, mask_dc, xsize, ysize);
-
-    for (int i = 0; i < 3; i++)
-    {
-        cuScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale);
-        cuScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale);
-    }
-}
-
-void cuCombineChannelsEx(
-    CUdeviceptr result/*out*/,
-    const ocu_channels &mask,
-    const ocu_channels &mask_dc,
-    const size_t xsize, const size_t ysize,
-    const CUdeviceptr block_diff_dc,
-    const CUdeviceptr block_diff_ac,
-    const CUdeviceptr edge_detector_map,
-    const size_t res_xsize,
-    const size_t step)
-{
-    ocu_args_d_t &ocl = getOcu();
-
-    const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step;
-    const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step;
-
-    const void *args[] = { &result,
-                           &mask.r, &mask.g, &mask.b,
-                           &mask_dc.r, &mask_dc.g, &mask_dc.b,
-                           &xsize, &ysize,
-                           &block_diff_dc, &block_diff_ac, &edge_detector_map,
-                           &res_xsize,
-                           &step };
-
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_COMBINECHANNELS],
-        work_xsize, work_ysize, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-}
-
-void cuUpsampleSquareRootEx(CUdeviceptr diffmap, const size_t xsize, const size_t ysize, const int step)
-{
-    ocu_args_d_t &ocl = getOcu();
-
-    CUdeviceptr diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float));
-
-    const void *args[] = { &diffmap_out,
-                           &diffmap,
-                           &xsize, &ysize,
-                           &step };
-
-    const size_t res_xsize = (xsize + step - 1) / step;
-    const size_t res_ysize = (ysize + step - 1) / step;
-
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_UPSAMPLESQUAREROOT],
-        res_xsize, res_ysize, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-
-    cuMemcpyDtoD(diffmap, diffmap_out, xsize * ysize * sizeof(float));
-
-    cuMemFree(diffmap_out);
-}
-
-void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr in, const size_t xsize, const size_t ysize, const int step)
-{
-    ocu_args_d_t &ocl = getOcu();
-
-    int cls = 8 - step;
-    int cls2 = (8 - step) / 2;
-
-    const void *args[] = { &out,
-                           &in,
-                           &xsize,
-                           &cls,
-                           &cls2 };
-
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_REMOVEBORDER],
-        xsize - cls, ysize - cls, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-}
-
-void cuAddBorderEx(CUdeviceptr out, size_t xsize, size_t ysize, int step, CUdeviceptr in)
-{
-    ocu_args_d_t &ocl = getOcu();
-
-    int cls = 8 - step;
-    int cls2 = (8 - step) / 2;
-
-    const void *args[] = { &out,
-        &cls,
-        &cls2,
-        &in};
-
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_ADDBORDER],
-        xsize, ysize, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-}
-
-void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step)
-{
-    cuUpsampleSquareRootEx(diffmap, xsize, ysize, step);
-
-    static const double kSigma = 8.8510880283;
-    static const double mul1 = 24.8235314874;
-    static const double scale = 1.0 / (1.0 + mul1);
-
-    const int s = 8 - step;
-    int s2 = (8 - step) / 2;
-
-    ocu_args_d_t &ocl = getOcu();
-    CUdeviceptr blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float));
-    cuRemoveBorderEx(blurred, diffmap, xsize, ysize, step);
-
-    static const double border_ratio = 0.03027655136;
-    cuBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio);
-
-    cuAddBorderEx(diffmap, xsize, ysize, step, blurred);
-    cuScaleImageEx(diffmap, xsize * ysize, scale);
-
-    cuMemFree(blurred);
-}
-
-void cuDiffmapOpsinDynamicsImage(
-    float* result,
-    const float* r, const float* g, const float* b,
-    const float* r2, const float* g2, const float* b2,
-    const size_t xsize, const size_t ysize,
-    const size_t step)
-{
-    const size_t res_xsize = (xsize + step - 1) / step;
-    const size_t res_ysize = (ysize + step - 1) / step;
-
-    size_t channel_size = xsize * ysize * sizeof(float);
-    size_t channel_step_size = res_xsize * res_ysize * sizeof(float);
-
-    CUresult err = CUDA_SUCCESS;
-    ocu_args_d_t &ocl = getOcu();
-    ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
-    ocu_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
-
-    CUdeviceptr mem_result = ocl.allocMem(channel_size, result);
-
-    CUdeviceptr edge_detector_map = ocl.allocMem(3 * channel_step_size);
-    CUdeviceptr block_diff_dc = ocl.allocMem(3 * channel_step_size);
-    CUdeviceptr block_diff_ac = ocl.allocMem(3 * channel_step_size);
-
-    cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
-
-    cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step);
-    cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step);
-    cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step);
-    {
-        ocu_channels mask = ocl.allocMemChannels(channel_size);
-        ocu_channels mask_dc = ocl.allocMemChannels(channel_size);
-        cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize);
-        cuCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step);
-
-        ocl.releaseMemChannels(mask);
-        ocl.releaseMemChannels(mask_dc);
-    }
-
-    cuCalculateDiffmapEx(mem_result, xsize, ysize, step);
-
-    cuMemcpyDtoH(result, mem_result, channel_size);
-
-    ocl.releaseMemChannels(xyb1);
-    ocl.releaseMemChannels(xyb0);
-
-    cuMemFree(edge_detector_map);
-    cuMemFree(block_diff_dc);
-    cuMemFree(block_diff_ac);
-
-    cuMemFree(mem_result);
-}
-
-void cuComputeBlockZeroingOrder(
-    guetzli::CoeffData *output_order_batch,
-    const channel_info orig_channel[3],
-    const float *orig_image_batch,
-    const float *mask_scale,
-    const int image_width,
-    const int image_height,
-    const channel_info mayout_channel[3],
-    const int factor,
-    const int comp_mask,
-    const float BlockErrorLimit)
-{
-    const int block8_width = (image_width + 8 - 1) / 8;
-    const int block8_height = (image_height + 8 - 1) / 8;
-    const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor);
-    const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor);
-
-    using namespace guetzli;
-
-    cl_int err = 0;
-    ocu_args_d_t &ocl = getOcu();
-
-    CUdeviceptr mem_orig_coeff[3];
-    CUdeviceptr mem_mayout_coeff[3];
-    CUdeviceptr mem_mayout_pixel[3];
-    for (int c = 0; c < 3; c++)
-    {
-        int block_count = orig_channel[c].block_width * orig_channel[c].block_height;
-        mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff);
-
-        block_count = mayout_channel[c].block_width * mayout_channel[c].block_height;
-        mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff);
-
-        mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel);
-    }
-    CUdeviceptr mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch);
-    CUdeviceptr mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale);
-
-    int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height;
-    CUdeviceptr mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch);
-
-    const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2], 
-                           &mem_orig_image, &mem_orig_image, &mem_mask_scale,
-                           &image_width, &image_height,
-                           &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2],
-                           &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2],
-                           &mayout_channel[0], &mayout_channel[1], &mayout_channel[2],
-                           &factor,
-                           &comp_mask,
-                           &BlockErrorLimit,
-                           &mem_output_order_batch};
-
-    err = cuLaunchKernel(ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER],
-        blockf_width, blockf_height, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-
-    cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size);
-
-    for (int c = 0; c < 3; c++)
-    {
-        cuMemFree(mem_orig_coeff[c]);
-        cuMemFree(mem_mayout_coeff[c]);
-        cuMemFree(mem_mayout_pixel[c]);
-
-    }
-
-    cuMemFree(mem_orig_image);
-    cuMemFree(mem_mask_scale);
-    cuMemFree(mem_output_order_batch);
-}
-
-void cuMask(
-    float* mask_r, float* mask_g, float* mask_b,
-    float* maskdc_r, float* maskdc_g, float* maskdc_b,
-    const size_t xsize, const size_t ysize,
-    const float* r, const float* g, const float* b,
-    const float* r2, const float* g2, const float* b2)
-{
-    CUresult err = CUDA_SUCCESS;
-    ocu_args_d_t &ocl = getOcu();
-
-    size_t channel_size = xsize * ysize * sizeof(float);
-
-    ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
-    ocu_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2);
-    ocu_channels mask = ocl.allocMemChannels(channel_size);
-    ocu_channels mask_dc = ocl.allocMemChannels(channel_size);
-
-    cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize);
-
-    cuMemcpyDtoH(mask_r, mask.r, channel_size);
-    cuMemcpyDtoH(mask_g, mask.g, channel_size);
-    cuMemcpyDtoH(mask_b, mask.b, channel_size);
-    cuMemcpyDtoH(maskdc_r, mask_dc.r, channel_size);
-    cuMemcpyDtoH(maskdc_g, mask_dc.g, channel_size);
-    cuMemcpyDtoH(maskdc_b, mask_dc.b, channel_size);
-
-    ocl.releaseMemChannels(rgb);
-    ocl.releaseMemChannels(rgb2);
-    ocl.releaseMemChannels(mask);
-    ocl.releaseMemChannels(mask_dc);
-}
-
-void cuConvolutionXEx(
-    CUdeviceptr result/*out*/,
-    const CUdeviceptr inp, size_t xsize, size_t ysize,
-    const CUdeviceptr multipliers, size_t len,
-    int xstep, int offset, double border_ratio)
-{
-    ocu_args_d_t &ocu = getOcu();
-
-    const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
-
-    CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX],
-        xsize, ysize, 1,
-        1, 1, 1,
-        0,
-        ocu.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocu.stream);
-}
-
-void cuConvolutionYEx(
-    CUdeviceptr result/*out*/,
-    const CUdeviceptr inp, size_t xsize, size_t ysize,
-    const CUdeviceptr multipliers, size_t len,
-    int xstep, int offset, double border_ratio)
-{
-    CUresult err = CUDA_SUCCESS;
-    ocu_args_d_t &ocu = getOcu();
-
-    const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
-
-    err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONY],
-        xsize, ysize, 1,
-        1, 1, 1,
-        0,
-        ocu.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocu.stream);
-}
-
-void cuSquareSampleEx(
-    CUdeviceptr result/*out*/,
-    const CUdeviceptr image, size_t xsize, size_t ysize,
-    size_t xstep, size_t ystep)
-{
-    CUresult err = CUDA_SUCCESS;
-    ocu_args_d_t &ocu = getOcu();
-
-    const void *args[] = { &result, &image, &xstep, &ystep};
-
-    err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE],
-        xsize, ysize, 1,
-        1, 1, 1,
-        0,
-        ocu.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocu.stream);
-}
-
-void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize,
-    const double sigma, const double border_ratio,
-    CUdeviceptr result/*out, opt*/)
-{
-    double m = 2.25;  // Accuracy increases when m is increased.
-    const double scaler = -1.0 / (2 * sigma * sigma);
-    // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52}
-    const int diff = std::max<int>(1, m * fabs(sigma));
-    const int expn_size = 2 * diff + 1;
-    std::vector<float> expn(expn_size);
-    for (int i = -diff; i <= diff; ++i) {
-        expn[i + diff] = static_cast<float>(exp(scaler * i * i));
-    }
-
-    const int xstep = std::max<int>(1, int(sigma / 3));
-
-    CUresult err = CUDA_SUCCESS;
-    ocu_args_d_t &ocu = getOcu();
-    CUdeviceptr mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data());
-
-    if (xstep > 1)
-    {
-        CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize);
-        cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-        cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-        cuSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep);
-        cuMemFree(srcA);
-    }
-    else
-    {
-        CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize);
-        cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-        cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-        cuMemFree(srcA);
-    }
-
-    cuMemFree(mem_expn);
-}
-
-void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize)
-{
-    static const double kSigma = 1.1;
-
-    size_t channel_size = xsize * ysize * sizeof(float);
-
-    CUresult err = CUDA_SUCCESS;
-    ocu_args_d_t &ocu = getOcu();
-    ocu_channels rgb_blurred = ocu.allocMemChannels(channel_size);
-
-    cuBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r);
-    cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g);
-    cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
-
-    void *args[] = { &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b};
-
-    CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE],
-        xsize * ysize, 1, 1,
-        1, 1, 1,
-        0,
-        ocu.stream, args, NULL);
-
-    r = cuStreamSynchronize(ocu.stream);
-
-    ocu.releaseMemChannels(rgb_blurred);
-}
\ No newline at end of file
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 8407a1c5..279884d6 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -6,6 +6,8 @@
 #include "ocl.h"
 #include "clguetzli.cl.h"
 
+#include "cuguetzli.h"
+
 extern bool g_useOpenCL;
 extern bool g_useCuda;
 extern bool g_checkOpenCL;
@@ -135,41 +137,6 @@ void clAddBorderEx(cl_mem out, const size_t xsize, const size_t ysize, const int
 
 void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step);
 
-////////////////////////////////////////////////////////////////
-void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize);
-
-void cuDiffmapOpsinDynamicsImage(
-    float* result,
-    const float* r, const float* g, const float* b,
-    const float* r2, const float* g2, const float* b2,
-    const size_t xsize, const size_t ysize,
-    const size_t step);
-
-void cuComputeBlockZeroingOrder(
-    guetzli::CoeffData *output_order_batch,
-    const channel_info orig_channel[3],
-    const float *orig_image_batch,
-    const float *mask_scale,
-    const int image_width,
-    const int image_height,
-    const channel_info mayout_channel[3],
-    const int factor,
-    const int comp_mask,
-    const float BlockErrorLimit);
-
-void cuMask(
-    float* mask_r, float* mask_g, float* mask_b,
-    float* maskdc_r, float* maskdc_g, float* maskdc_b,
-    const size_t xsize, const size_t ysize,
-    const float* r, const float* g, const float* b,
-    const float* r2, const float* g2, const float* b2);
-
-void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize,
-    const double sigma, const double border_ratio,
-    CUdeviceptr result = NULL/*out, opt*/);
-
-void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize);
-
 class guetzli::OutputImage;
 
 namespace guetzli {
diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
new file mode 100644
index 00000000..1bb85334
--- /dev/null
+++ b/clguetzli/cuguetzli.cpp
@@ -0,0 +1,801 @@
+#include "cuguetzli.h"
+#include <algorithm>
+#include "ocu.h"
+
+void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize)
+{
+    size_t channel_size = xsize * ysize * sizeof(float);
+
+    ocu_args_d_t &ocu = getOcu();
+    ocu_channels rgb = ocu.allocMemChannels(channel_size, r, g, b);
+
+    cuOpsinDynamicsImageEx(rgb, xsize, ysize);
+
+    cuMemcpyDtoH(r, rgb.r, channel_size);
+    cuMemcpyDtoH(g, rgb.g, channel_size);
+    cuMemcpyDtoH(b, rgb.b, channel_size);
+
+    ocu.releaseMemChannels(rgb);
+}
+
+void cuMaskHighIntensityChangeEx(
+    ocu_channels &xyb0/*in,out*/,
+    ocu_channels &xyb1/*in,out*/,
+    const size_t xsize, const size_t ysize)
+{
+    size_t channel_size = xsize * ysize * sizeof(float);
+
+    ocu_args_d_t &ocl = getOcu();
+
+    ocu_channels c0 = ocl.allocMemChannels(channel_size);
+    ocu_channels c1 = ocl.allocMemChannels(channel_size);
+
+    cuMemcpyDtoD(c0.r, xyb0.r, channel_size);
+    cuMemcpyDtoD(c0.g, xyb0.g, channel_size);
+    cuMemcpyDtoD(c0.b, xyb0.b, channel_size);
+    cuMemcpyDtoD(c1.r, xyb1.r, channel_size);
+    cuMemcpyDtoD(c1.g, xyb1.g, channel_size);
+    cuMemcpyDtoD(c1.b, xyb1.b, channel_size);
+
+    const void *args[] = { &xyb0.r, &xyb0.g, &xyb0.b,
+        &xyb1.r, &xyb1.g, &xyb1.b,
+        &c0.r, &c0.g, &c0.b,
+        &c1.r, &c1.g, &c1.b };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    ocl.releaseMemChannels(c0);
+    ocl.releaseMemChannels(c1);
+}
+
+void cuEdgeDetectorMapEx(
+    CUdeviceptr result/*out*/,
+    const ocu_channels &rgb, const ocu_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step)
+{
+    size_t channel_size = xsize * ysize * sizeof(float);
+
+    ocu_args_d_t &ocl = getOcu();
+
+    ocu_channels rgb_blured = ocl.allocMemChannels(channel_size);
+    ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size);
+
+    static const double kSigma[3] = { 1.5, 0.586, 0.4 };
+
+    for (int i = 0; i < 3; i++)
+    {
+        cuBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]);
+        cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]);
+    }
+
+    const void *args[] = { &result,
+        &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
+        &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
+        &xsize, &ysize, &step };
+
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTOR],
+        res_xsize, res_ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    ocl.releaseMemChannels(rgb_blured);
+    ocl.releaseMemChannels(rgb2_blured);
+}
+
+void cuBlockDiffMapEx(
+    CUdeviceptr block_diff_dc/*out*/,
+    CUdeviceptr block_diff_ac/*out*/,
+    const ocu_channels &rgb, const ocu_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    const void *args[] = { &block_diff_dc, &block_diff_ac,
+        &rgb.r, &rgb.g, &rgb.b,
+        &rgb2.r, &rgb2.g, &rgb2.b,
+        &xsize, &ysize, &step };
+
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_BLOCKDIFFMAP],
+        res_xsize, res_ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+}
+
+void cuEdgeDetectorLowFreqEx(
+    CUdeviceptr block_diff_ac/*in,out*/,
+    const ocu_channels &rgb, const ocu_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step)
+{
+    size_t channel_size = xsize * ysize * sizeof(float);
+
+    static const double kSigma = 14;
+
+    ocu_args_d_t &ocl = getOcu();
+    ocu_channels rgb_blured = ocl.allocMemChannels(channel_size);
+    ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size);
+
+    for (int i = 0; i < 3; i++)
+    {
+        cuBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]);
+        cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]);
+    }
+
+    const void *args[] = { &block_diff_ac,
+        &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
+        &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
+        &xsize, &ysize, &step };
+
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ],
+        res_xsize, res_ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    ocl.releaseMemChannels(rgb_blured);
+    ocl.releaseMemChannels(rgb2_blured);
+}
+
+void cuDiffPrecomputeEx(
+    ocu_channels &mask/*out*/,
+    const ocu_channels &xyb0, const ocu_channels &xyb1,
+    const size_t xsize, const size_t ysize)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    const void *args[] = { &mask.x, &mask.y, &mask.b,
+        &xyb0.x, &xyb0.y, &xyb0.b,
+        &xyb1.x, &xyb1.y, &xyb1.b };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DIFFPRECOMPUTE],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+}
+
+void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    const void *args[] = { &img, &w };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_SCALEIMAGE],
+        size, 1, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+}
+
+void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t ysize)
+{
+    if (xsize < 4 || ysize < 4) {
+        // TODO: Make this work for small dimensions as well.
+        return;
+    }
+
+    ocu_args_d_t &ocl = getOcu();
+
+    size_t len = xsize * ysize * sizeof(float);
+    CUdeviceptr img_org = ocl.allocMem(len);
+
+    cuMemcpyDtoD(img_org, img, len);
+
+    const void *args[] = { &img, &img_org };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_AVERAGE5X5],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    cuMemFree(img_org);
+}
+
+void cuMinSquareValEx(
+    CUdeviceptr img/*in,out*/,
+    const size_t xsize, const size_t ysize,
+    const size_t square_size, const size_t offset)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    CUdeviceptr srcA = ocl.allocMem(sizeof(float) * xsize * ysize);
+
+    const void *args[] = { &srcA, &img, &square_size, &offset };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MINSQUAREVAL],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    cuMemcpyDtoD(img, srcA, sizeof(float) * xsize * ysize);
+    cuMemFree(srcA);
+}
+
+static void MakeMask(double extmul, double extoff,
+    double mul, double offset,
+    double scaler, double *result)
+{
+    for (size_t i = 0; i < 512; ++i) {
+        const double c = mul / ((0.01 * scaler * i) + offset);
+        result[i] = 1.0 + extmul * (c + extoff);
+        result[i] *= result[i];
+    }
+}
+
+static const double kInternalGoodQualityThreshold = 14.921561160295326;
+static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold;
+
+void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, size_t xsize, size_t ysize)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    double extmul = 0.975741017749;
+    double extoff = -4.25328244168;
+    double offset = 0.454909521427;
+    double scaler = 0.0738288224836;
+    double mul = 20.8029176447;
+    static double lut_x[512];
+    static bool lutx_init = false;
+    if (!lutx_init)
+    {
+        lutx_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_x);
+    }
+
+    extmul = 0.373995618954;
+    extoff = 1.5307267433;
+    offset = 0.911952641929;
+    scaler = 1.1731667845;
+    mul = 16.2447033988;
+    static double lut_y[512];
+    static bool luty_init = false;
+    if (!luty_init)
+    {
+        luty_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_y);
+    }
+
+    extmul = 0.61582234137;
+    extoff = -4.25376118646;
+    offset = 1.05105070921;
+    scaler = 0.47434643535;
+    mul = 31.1444967089;
+    static double lut_b[512];
+    static bool lutb_init = false;
+    if (!lutb_init)
+    {
+        lutb_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_b);
+    }
+
+    extmul = 1.79116943438;
+    extoff = -3.86797479189;
+    offset = 0.670960225853;
+    scaler = 0.486575865525;
+    mul = 20.4563479139;
+    static double lut_dcx[512];
+    static bool lutdcx_init = false;
+    if (!lutdcx_init)
+    {
+        lutdcx_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_dcx);
+    }
+
+    extmul = 0.212223514236;
+    extoff = -3.65647120524;
+    offset = 1.73396799447;
+    scaler = 0.170392660501;
+    mul = 21.6566724788;
+    static double lut_dcy[512];
+    static bool lutdcy_init = false;
+    if (!lutdcy_init)
+    {
+        lutdcy_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_dcy);
+    }
+
+    extmul = 0.349376011816;
+    extoff = -0.894711072781;
+    offset = 0.901647926679;
+    scaler = 0.380086095024;
+    mul = 18.0373825149;
+    static double lut_dcb[512];
+    static bool lutdcb_init = false;
+    if (!lutdcb_init)
+    {
+        lutdcb_init = true;
+        MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb);
+    }
+
+    size_t channel_size = 512 * 3 * sizeof(double);
+    ocu_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b);
+    ocu_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb);
+
+    const void *args[] = { &mask.r, &mask.g, &mask.b,
+        &mask_dc.r, &mask_dc.g, &mask_dc.b,
+        &xyb.x, &xyb.y, &xyb.b,
+        &xyb_dc.x, &xyb_dc.y, &xyb_dc.b };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DOMASK],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    ocl.releaseMemChannels(xyb);
+    ocl.releaseMemChannels(xyb_dc);
+}
+
+void cuMaskEx(
+    ocu_channels mask/*out*/, ocu_channels mask_dc/*out*/,
+    const ocu_channels &rgb, const ocu_channels &rgb2,
+    const size_t xsize, const size_t ysize)
+{
+    cuDiffPrecomputeEx(mask, rgb, rgb2, xsize, ysize);
+    for (int i = 0; i < 3; i++)
+    {
+        cuAverage5x5Ex(mask.ch[i], xsize, ysize);
+        cuMinSquareValEx(mask.ch[i], xsize, ysize, 4, 0);
+
+        static const double sigma[3] = {
+            9.65781083553,
+            14.2644604355,
+            4.53358927369,
+        };
+
+        cuBlurEx(mask.ch[i], xsize, ysize, sigma[i], 0.0);
+    }
+
+    cuDoMask(mask, mask_dc, xsize, ysize);
+
+    for (int i = 0; i < 3; i++)
+    {
+        cuScaleImageEx(mask.ch[i], xsize * ysize, kGlobalScale * kGlobalScale);
+        cuScaleImageEx(mask_dc.ch[i], xsize * ysize, kGlobalScale * kGlobalScale);
+    }
+}
+
+void cuCombineChannelsEx(
+    CUdeviceptr result/*out*/,
+    const ocu_channels &mask,
+    const ocu_channels &mask_dc,
+    const size_t xsize, const size_t ysize,
+    const CUdeviceptr block_diff_dc,
+    const CUdeviceptr block_diff_ac,
+    const CUdeviceptr edge_detector_map,
+    const size_t res_xsize,
+    const size_t step)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step;
+    const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step;
+
+    const void *args[] = { &result,
+        &mask.r, &mask.g, &mask.b,
+        &mask_dc.r, &mask_dc.g, &mask_dc.b,
+        &xsize, &ysize,
+        &block_diff_dc, &block_diff_ac, &edge_detector_map,
+        &res_xsize,
+        &step };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_COMBINECHANNELS],
+        work_xsize, work_ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+}
+
+void cuUpsampleSquareRootEx(CUdeviceptr diffmap, const size_t xsize, const size_t ysize, const int step)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    CUdeviceptr diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float));
+
+    const void *args[] = { &diffmap_out,
+        &diffmap,
+        &xsize, &ysize,
+        &step };
+
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_UPSAMPLESQUAREROOT],
+        res_xsize, res_ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    cuMemcpyDtoD(diffmap, diffmap_out, xsize * ysize * sizeof(float));
+
+    cuMemFree(diffmap_out);
+}
+
+void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr in, const size_t xsize, const size_t ysize, const int step)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    int cls = 8 - step;
+    int cls2 = (8 - step) / 2;
+
+    const void *args[] = { &out,
+        &in,
+        &xsize,
+        &cls,
+        &cls2 };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_REMOVEBORDER],
+        xsize - cls, ysize - cls, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+}
+
+void cuAddBorderEx(CUdeviceptr out, size_t xsize, size_t ysize, int step, CUdeviceptr in)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    int cls = 8 - step;
+    int cls2 = (8 - step) / 2;
+
+    const void *args[] = { &out,
+        &cls,
+        &cls2,
+        &in };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_ADDBORDER],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+}
+
+void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step)
+{
+    cuUpsampleSquareRootEx(diffmap, xsize, ysize, step);
+
+    static const double kSigma = 8.8510880283;
+    static const double mul1 = 24.8235314874;
+    static const double scale = 1.0 / (1.0 + mul1);
+
+    const int s = 8 - step;
+    int s2 = (8 - step) / 2;
+
+    ocu_args_d_t &ocl = getOcu();
+    CUdeviceptr blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float));
+    cuRemoveBorderEx(blurred, diffmap, xsize, ysize, step);
+
+    static const double border_ratio = 0.03027655136;
+    cuBlurEx(blurred, xsize - s, ysize - s, kSigma, border_ratio);
+
+    cuAddBorderEx(diffmap, xsize, ysize, step, blurred);
+    cuScaleImageEx(diffmap, xsize * ysize, scale);
+
+    cuMemFree(blurred);
+}
+
+void cuDiffmapOpsinDynamicsImage(
+    float* result,
+    const float* r, const float* g, const float* b,
+    const float* r2, const float* g2, const float* b2,
+    const size_t xsize, const size_t ysize,
+    const size_t step)
+{
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
+    size_t channel_size = xsize * ysize * sizeof(float);
+    size_t channel_step_size = res_xsize * res_ysize * sizeof(float);
+
+    CUresult err = CUDA_SUCCESS;
+    ocu_args_d_t &ocl = getOcu();
+    ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
+    ocu_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
+
+    CUdeviceptr mem_result = ocl.allocMem(channel_size, result);
+
+    CUdeviceptr edge_detector_map = ocl.allocMem(3 * channel_step_size);
+    CUdeviceptr block_diff_dc = ocl.allocMem(3 * channel_step_size);
+    CUdeviceptr block_diff_ac = ocl.allocMem(3 * channel_step_size);
+
+    cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
+
+    cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step);
+    cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step);
+    cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step);
+    {
+        ocu_channels mask = ocl.allocMemChannels(channel_size);
+        ocu_channels mask_dc = ocl.allocMemChannels(channel_size);
+        cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize);
+        cuCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step);
+
+        ocl.releaseMemChannels(mask);
+        ocl.releaseMemChannels(mask_dc);
+    }
+
+    cuCalculateDiffmapEx(mem_result, xsize, ysize, step);
+
+    cuMemcpyDtoH(result, mem_result, channel_size);
+
+    ocl.releaseMemChannels(xyb1);
+    ocl.releaseMemChannels(xyb0);
+
+    cuMemFree(edge_detector_map);
+    cuMemFree(block_diff_dc);
+    cuMemFree(block_diff_ac);
+
+    cuMemFree(mem_result);
+}
+
+void cuComputeBlockZeroingOrder(
+    guetzli::CoeffData *output_order_batch,
+    const channel_info orig_channel[3],
+    const float *orig_image_batch,
+    const float *mask_scale,
+    const int image_width,
+    const int image_height,
+    const channel_info mayout_channel[3],
+    const int factor,
+    const int comp_mask,
+    const float BlockErrorLimit)
+{
+    const int block8_width = (image_width + 8 - 1) / 8;
+    const int block8_height = (image_height + 8 - 1) / 8;
+    const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor);
+    const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor);
+
+    using namespace guetzli;
+
+    cl_int err = 0;
+    ocu_args_d_t &ocl = getOcu();
+
+    CUdeviceptr mem_orig_coeff[3];
+    CUdeviceptr mem_mayout_coeff[3];
+    CUdeviceptr mem_mayout_pixel[3];
+    for (int c = 0; c < 3; c++)
+    {
+        int block_count = orig_channel[c].block_width * orig_channel[c].block_height;
+        mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff);
+
+        block_count = mayout_channel[c].block_width * mayout_channel[c].block_height;
+        mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff);
+
+        mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel);
+    }
+    CUdeviceptr mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch);
+    CUdeviceptr mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale);
+
+    int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height;
+    CUdeviceptr mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch);
+
+    const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2],
+        &mem_orig_image, &mem_orig_image, &mem_mask_scale,
+        &image_width, &image_height,
+        &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2],
+        &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2],
+        &mayout_channel[0], &mayout_channel[1], &mayout_channel[2],
+        &factor,
+        &comp_mask,
+        &BlockErrorLimit,
+        &mem_output_order_batch };
+
+    err = cuLaunchKernel(ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER],
+        blockf_width, blockf_height, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size);
+
+    for (int c = 0; c < 3; c++)
+    {
+        cuMemFree(mem_orig_coeff[c]);
+        cuMemFree(mem_mayout_coeff[c]);
+        cuMemFree(mem_mayout_pixel[c]);
+
+    }
+
+    cuMemFree(mem_orig_image);
+    cuMemFree(mem_mask_scale);
+    cuMemFree(mem_output_order_batch);
+}
+
+void cuMask(
+    float* mask_r, float* mask_g, float* mask_b,
+    float* maskdc_r, float* maskdc_g, float* maskdc_b,
+    const size_t xsize, const size_t ysize,
+    const float* r, const float* g, const float* b,
+    const float* r2, const float* g2, const float* b2)
+{
+    CUresult err = CUDA_SUCCESS;
+    ocu_args_d_t &ocl = getOcu();
+
+    size_t channel_size = xsize * ysize * sizeof(float);
+
+    ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
+    ocu_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2);
+    ocu_channels mask = ocl.allocMemChannels(channel_size);
+    ocu_channels mask_dc = ocl.allocMemChannels(channel_size);
+
+    cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize);
+
+    cuMemcpyDtoH(mask_r, mask.r, channel_size);
+    cuMemcpyDtoH(mask_g, mask.g, channel_size);
+    cuMemcpyDtoH(mask_b, mask.b, channel_size);
+    cuMemcpyDtoH(maskdc_r, mask_dc.r, channel_size);
+    cuMemcpyDtoH(maskdc_g, mask_dc.g, channel_size);
+    cuMemcpyDtoH(maskdc_b, mask_dc.b, channel_size);
+
+    ocl.releaseMemChannels(rgb);
+    ocl.releaseMemChannels(rgb2);
+    ocl.releaseMemChannels(mask);
+    ocl.releaseMemChannels(mask_dc);
+}
+
+void cuConvolutionXEx(
+    CUdeviceptr result/*out*/,
+    const CUdeviceptr inp, size_t xsize, size_t ysize,
+    const CUdeviceptr multipliers, size_t len,
+    int xstep, int offset, double border_ratio)
+{
+    ocu_args_d_t &ocu = getOcu();
+
+    const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
+
+    CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocu.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocu.stream);
+}
+
+void cuConvolutionYEx(
+    CUdeviceptr result/*out*/,
+    const CUdeviceptr inp, size_t xsize, size_t ysize,
+    const CUdeviceptr multipliers, size_t len,
+    int xstep, int offset, double border_ratio)
+{
+    CUresult err = CUDA_SUCCESS;
+    ocu_args_d_t &ocu = getOcu();
+
+    const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
+
+    err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONY],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocu.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocu.stream);
+}
+
+void cuSquareSampleEx(
+    CUdeviceptr result/*out*/,
+    const CUdeviceptr image, size_t xsize, size_t ysize,
+    size_t xstep, size_t ystep)
+{
+    CUresult err = CUDA_SUCCESS;
+    ocu_args_d_t &ocu = getOcu();
+
+    const void *args[] = { &result, &image, &xstep, &ystep };
+
+    err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocu.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocu.stream);
+}
+
+void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize,
+    const double sigma, const double border_ratio,
+    CUdeviceptr result/*out, opt*/)
+{
+    double m = 2.25;  // Accuracy increases when m is increased.
+    const double scaler = -1.0 / (2 * sigma * sigma);
+    // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52}
+    const int diff = std::max<int>(1, m * fabs(sigma));
+    const int expn_size = 2 * diff + 1;
+    std::vector<float> expn(expn_size);
+    for (int i = -diff; i <= diff; ++i) {
+        expn[i + diff] = static_cast<float>(exp(scaler * i * i));
+    }
+
+    const int xstep = std::max<int>(1, int(sigma / 3));
+
+    CUresult err = CUDA_SUCCESS;
+    ocu_args_d_t &ocu = getOcu();
+    CUdeviceptr mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data());
+
+    if (xstep > 1)
+    {
+        CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize);
+        cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        cuSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep);
+        cuMemFree(srcA);
+    }
+    else
+    {
+        CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize);
+        cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        cuMemFree(srcA);
+    }
+
+    cuMemFree(mem_expn);
+}
+
+void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize)
+{
+    static const double kSigma = 1.1;
+
+    size_t channel_size = xsize * ysize * sizeof(float);
+
+    CUresult err = CUDA_SUCCESS;
+    ocu_args_d_t &ocu = getOcu();
+    ocu_channels rgb_blurred = ocu.allocMemChannels(channel_size);
+
+    cuBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r);
+    cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g);
+    cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
+
+    void *args[] = { &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b };
+
+    CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE],
+        xsize * ysize, 1, 1,
+        1, 1, 1,
+        0,
+        ocu.stream, args, NULL);
+
+    r = cuStreamSynchronize(ocu.stream);
+
+    ocu.releaseMemChannels(rgb_blurred);
+}
diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h
new file mode 100644
index 00000000..14c607cc
--- /dev/null
+++ b/clguetzli/cuguetzli.h
@@ -0,0 +1,37 @@
+#pragma once
+#include "guetzli/processor.h"
+#include "clguetzli.cl.h"
+
+void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize);
+
+void cuDiffmapOpsinDynamicsImage(
+    float* result,
+    const float* r, const float* g, const float* b,
+    const float* r2, const float* g2, const float* b2,
+    const size_t xsize, const size_t ysize,
+    const size_t step);
+
+void cuComputeBlockZeroingOrder(
+    guetzli::CoeffData *output_order_batch,
+    const channel_info orig_channel[3],
+    const float *orig_image_batch,
+    const float *mask_scale,
+    const int image_width,
+    const int image_height,
+    const channel_info mayout_channel[3],
+    const int factor,
+    const int comp_mask,
+    const float BlockErrorLimit);
+
+void cuMask(
+    float* mask_r, float* mask_g, float* mask_b,
+    float* maskdc_r, float* maskdc_g, float* maskdc_b,
+    const size_t xsize, const size_t ysize,
+    const float* r, const float* g, const float* b,
+    const float* r2, const float* g2, const float* b2);
+
+void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize,
+    const double sigma, const double border_ratio,
+    CUdeviceptr result = NULL/*out, opt*/);
+
+void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize);
\ No newline at end of file
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index b8798eb2..fc36b9a0 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -204,6 +204,7 @@
     <ClInclude Include="clguetzli\clguetzli.cl.h" />
     <ClInclude Include="clguetzli\clguetzli.h" />
     <ClInclude Include="clguetzli\clguetzli_test.h" />
+    <ClInclude Include="clguetzli\cuguetzli.h" />
     <ClInclude Include="clguetzli\ocl.h" />
     <ClInclude Include="clguetzli\ocu.h" />
     <ClInclude Include="clguetzli\utils.h" />
@@ -302,6 +303,7 @@
     <ClCompile Include="clguetzli\clguetzli.cl.cpp" />
     <ClCompile Include="clguetzli\clguetzli.cpp" />
     <ClCompile Include="clguetzli\clguetzli_test.cpp" />
+    <ClCompile Include="clguetzli\cuguetzli.cpp" />
     <ClCompile Include="clguetzli\ocl.cpp" />
     <ClCompile Include="clguetzli\ocu.cpp" />
     <ClCompile Include="clguetzli\utils.cpp" />
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index 07f56763..38921bde 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -315,6 +315,9 @@
     <ClInclude Include="clguetzli\ocu.h">
       <Filter>clguetzli</Filter>
     </ClInclude>
+    <ClInclude Include="clguetzli\cuguetzli.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc">
@@ -587,6 +590,9 @@
     <ClCompile Include="clguetzli\ocu.cpp">
       <Filter>clguetzli</Filter>
     </ClCompile>
+    <ClCompile Include="clguetzli\cuguetzli.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <None Include="third_party\libpng\pngwin.def">
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 4690aff1..63ebb609 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -567,7 +567,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
     CoeffData * output_order = NULL;
     ButteraugliComparatorEx * comp = (ButteraugliComparatorEx*)comparator_;
 
-    if (g_useOpenCL || g_useCuda || g_checkOpenCL)
+    if (g_useOpenCL || g_checkOpenCL)
     {
         channel_info orig_channel[3];
         channel_info mayout_channel[3];

From 5d49f244ed2209d791bc409a298215019f8f3ca0 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 3 Jun 2017 02:08:39 +0800
Subject: [PATCH 126/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cpp |  73 +---
 clguetzli/clguetzli.h   |  10 +-
 clguetzli/cuguetzli.cpp | 804 ++++++++++++++++++++--------------------
 clguetzli/cuguetzli.h   |  82 +++-
 4 files changed, 492 insertions(+), 477 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index f50ce17c..0774a074 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -89,8 +89,8 @@ void clDiffmapOpsinDynamicsImage(
     float* result,
     const float* r,  const float* g,  const float* b,
     const float* r2, const float* g2, const float* b2,
-    size_t xsize, size_t ysize,
-    size_t step)
+    const size_t xsize, const size_t ysize,
+    const size_t step)
 {
     const size_t res_xsize = (xsize + step - 1) / step;
     const size_t res_ysize = (ysize + step - 1) / step;
@@ -98,7 +98,6 @@ void clDiffmapOpsinDynamicsImage(
     size_t channel_size = xsize * ysize * sizeof(float);
     size_t channel_step_size = res_xsize * res_ysize * sizeof(float);
 
-    cl_int err = 0;
     ocl_args_d_t &ocl = getOcl();
     ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
     ocl_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
@@ -127,7 +126,7 @@ void clDiffmapOpsinDynamicsImage(
     clCalculateDiffmapEx(mem_result, xsize, ysize, step);
 
     clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, result, 0, NULL, NULL);
-    err = clFinish(ocl.commandQueue);
+    cl_int err = clFinish(ocl.commandQueue);
 
     ocl.releaseMemChannels(xyb1);
     ocl.releaseMemChannels(xyb0);
@@ -242,7 +241,6 @@ void clMask(
     const float* r,  const float* g,  const float* b,
     const float* r2, const float* g2, const float* b2)
 {
-    cl_int err = CL_SUCCESS;
     ocl_args_d_t &ocl = getOcl();
 
     size_t channel_size = xsize * ysize * sizeof(float);
@@ -260,7 +258,7 @@ void clMask(
     clEnqueueReadBuffer(ocl.commandQueue, mask_dc.r, false, 0, channel_size, maskdc_r, 0, NULL, NULL);
     clEnqueueReadBuffer(ocl.commandQueue, mask_dc.g, false, 0, channel_size, maskdc_g, 0, NULL, NULL);
     clEnqueueReadBuffer(ocl.commandQueue, mask_dc.b, false, 0, channel_size, maskdc_b, 0, NULL, NULL);
-    err = clFinish(ocl.commandQueue);
+    cl_int err = clFinish(ocl.commandQueue);
 
     ocl.releaseMemChannels(rgb);
     ocl.releaseMemChannels(rgb2);
@@ -410,54 +408,7 @@ void clSquareSampleEx(
 }
 
 void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
-    const double sigma, const double border_ratio,
-    cl_mem result/*out, opt*/)
-{
-    clBlurEx2(image, xsize, ysize, sigma, border_ratio, result);
-
-    return;
-/*
-    double m = 2.25;  // Accuracy increases when m is increased.
-    const double scaler = -1.0 / (2 * sigma * sigma);
-    // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52}
-    const int diff = std::max<int>(1, m * fabs(sigma));
-    const int expn_size = 2 * diff + 1;
-    std::vector<float> expn(expn_size);
-    for (int i = -diff; i <= diff; ++i) {
-        expn[i + diff] = static_cast<float>(exp(scaler * i * i));
-    }
-
-    const int xstep = std::max<int>(1, int(sigma / 3));
-    const int ystep = xstep;
-    int dxsize = (xsize + xstep - 1) / xstep;
-    int dysize = (ysize + ystep - 1) / ystep;
-
-    cl_int err = 0;
-    ocl_args_d_t &ocl = getOcl();
-    cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data());
-
-    if (xstep > 1)
-    {
-        ocl.allocA(sizeof(cl_float) * dxsize * ysize);
-        ocl.allocB(sizeof(cl_float) * dxsize * dysize);
-
-        clConvolutionEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-        clConvolutionEx(ocl.srcB, ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio);
-        clUpsampleEx(result ? result : image, ocl.srcB, xsize, ysize, xstep, ystep);
-    }
-    else
-    {
-        ocl.allocA(sizeof(cl_float) * xsize * ysize);
-        clConvolutionEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-        clConvolutionEx(result ? result : image, ocl.srcA, ysize, dxsize, mem_expn, expn_size, ystep, diff, border_ratio);
-    }
-
-    clReleaseMemObject(mem_expn);
-*/
-}
-
-void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
-	double sigma, double border_ratio,
+	const double sigma, const double border_ratio,
     cl_mem result/*out, opt*/)
 {
 	double m = 2.25;  // Accuracy increases when m is increased.
@@ -538,7 +489,6 @@ void clMaskHighIntensityChangeEx(
 {
 	size_t channel_size = xsize * ysize * sizeof(float);
 
-	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
 	ocl_channels c0 = ocl.allocMemChannels(channel_size);
@@ -550,7 +500,7 @@ void clMaskHighIntensityChangeEx(
 	clEnqueueCopyBuffer(ocl.commandQueue, xyb1.r, c1.r, 0, 0, channel_size, 0, NULL, NULL);
 	clEnqueueCopyBuffer(ocl.commandQueue, xyb1.g, c1.g, 0, 0, channel_size, 0, NULL, NULL);
 	clEnqueueCopyBuffer(ocl.commandQueue, xyb1.b, c1.b, 0, 0, channel_size, 0, NULL, NULL);
-	err = clFinish(ocl.commandQueue);
+	cl_int err = clFinish(ocl.commandQueue);
 
 	cl_kernel kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE];
 	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.r);
@@ -588,8 +538,6 @@ void clEdgeDetectorMapEx(
     const size_t xsize, const size_t ysize, const size_t step)
 {
 	size_t channel_size = xsize * ysize * sizeof(float);
-
-	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
 	ocl_channels rgb_blured = ocl.allocMemChannels(channel_size);
@@ -623,7 +571,7 @@ void clEdgeDetectorMapEx(
 	const size_t res_ysize = (ysize + step - 1) / step;
 
 	size_t globalWorkSize[2] = { res_xsize, res_ysize};
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
 		LogError("Error: clEdgeDetectorMapEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
@@ -644,7 +592,6 @@ void clBlockDiffMapEx(
     const ocl_channels &rgb, const ocl_channels &rgb2,
 	const size_t xsize, const size_t ysize, const size_t step)
 {
-	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_int clxsize = xsize;
@@ -668,7 +615,7 @@ void clBlockDiffMapEx(
 	const size_t res_ysize = (ysize + step - 1) / step;
 
 	size_t globalWorkSize[2] = { res_xsize, res_ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
 		LogError("Error: clBlockDiffMapEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
@@ -688,8 +635,6 @@ void clEdgeDetectorLowFreqEx(
 	size_t channel_size = xsize * ysize * sizeof(float);
 
 	static const double kSigma = 14;
-
-	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
 	ocl_channels rgb_blured = ocl.allocMemChannels(channel_size);
 	ocl_channels rgb2_blured = ocl.allocMemChannels(channel_size);
@@ -720,7 +665,7 @@ void clEdgeDetectorLowFreqEx(
 	const size_t res_ysize = (ysize + step - 1) / step;
 
 	size_t globalWorkSize[2] = { res_xsize, res_ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
 	if (CL_SUCCESS != err)
 	{
 		LogError("Error: clEdgeDetectorLowFreqEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 279884d6..31b10e36 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -1,6 +1,5 @@
 #pragma once
 #include <vector>
-#include "CL/cl.h"
 #include "guetzli/processor.h"
 #include "guetzli/butteraugli_comparator.h"
 #include "ocl.h"
@@ -33,8 +32,7 @@ void clComputeBlockZeroingOrder(
     const channel_info mayout_channel[3],
     const int factor,
     const int comp_mask,
-    const float BlockErrorLimit
-    );
+    const float BlockErrorLimit);
 
 void clMask(
     float* mask_r,   float* mask_g,   float* mask_b,
@@ -70,10 +68,6 @@ void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
     const double sigma, const double border_ratio,
     cl_mem result = nullptr/*out, opt*/);
 
-void clBlurEx2(cl_mem image/*out, opt*/, size_t xsize, size_t ysize,
-    double sigma, double border_ratio,
-    cl_mem result = NULL/*out, opt*/);
-
 void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t ysize);
 
 void clMaskHighIntensityChangeEx(
@@ -129,8 +123,6 @@ void clCombineChannelsEx(
 
 void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysize, const int step);
 
-void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, size_t xsize, size_t ysize, int step);
-
 void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const size_t ysize, const int step);
 
 void clAddBorderEx(cl_mem out, const size_t xsize, const size_t ysize, const int step, const cl_mem in);
diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
index 1bb85334..5eaba7e5 100644
--- a/clguetzli/cuguetzli.cpp
+++ b/clguetzli/cuguetzli.cpp
@@ -6,189 +6,472 @@ void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, cons
 {
     size_t channel_size = xsize * ysize * sizeof(float);
 
-    ocu_args_d_t &ocu = getOcu();
-    ocu_channels rgb = ocu.allocMemChannels(channel_size, r, g, b);
+    ocu_args_d_t &ocl = getOcu();
+    ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
 
     cuOpsinDynamicsImageEx(rgb, xsize, ysize);
 
     cuMemcpyDtoH(r, rgb.r, channel_size);
     cuMemcpyDtoH(g, rgb.g, channel_size);
-    cuMemcpyDtoH(b, rgb.b, channel_size);
+	cuMemcpyDtoH(b, rgb.b, channel_size);
 
-    ocu.releaseMemChannels(rgb);
+    ocl.releaseMemChannels(rgb);
 }
 
-void cuMaskHighIntensityChangeEx(
-    ocu_channels &xyb0/*in,out*/,
-    ocu_channels &xyb1/*in,out*/,
-    const size_t xsize, const size_t ysize)
+void cuDiffmapOpsinDynamicsImage(
+    float* result,
+    const float* r, const float* g, const float* b,
+    const float* r2, const float* g2, const float* b2,
+    const size_t xsize, const size_t ysize,
+    const size_t step)
 {
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
     size_t channel_size = xsize * ysize * sizeof(float);
+    size_t channel_step_size = res_xsize * res_ysize * sizeof(float);
 
     ocu_args_d_t &ocl = getOcu();
+    ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
+    ocu_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
 
-    ocu_channels c0 = ocl.allocMemChannels(channel_size);
-    ocu_channels c1 = ocl.allocMemChannels(channel_size);
+    CUdeviceptr mem_result = ocl.allocMem(channel_size, result);
 
-    cuMemcpyDtoD(c0.r, xyb0.r, channel_size);
-    cuMemcpyDtoD(c0.g, xyb0.g, channel_size);
-    cuMemcpyDtoD(c0.b, xyb0.b, channel_size);
-    cuMemcpyDtoD(c1.r, xyb1.r, channel_size);
-    cuMemcpyDtoD(c1.g, xyb1.g, channel_size);
-    cuMemcpyDtoD(c1.b, xyb1.b, channel_size);
+    CUdeviceptr edge_detector_map = ocl.allocMem(3 * channel_step_size);
+    CUdeviceptr block_diff_dc = ocl.allocMem(3 * channel_step_size);
+    CUdeviceptr block_diff_ac = ocl.allocMem(3 * channel_step_size);
 
-    const void *args[] = { &xyb0.r, &xyb0.g, &xyb0.b,
-        &xyb1.r, &xyb1.g, &xyb1.b,
-        &c0.r, &c0.g, &c0.b,
-        &c1.r, &c1.g, &c1.b };
+    cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE],
-        xsize, ysize, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
+    cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step);
+    cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step);
+    cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step);
+    {
+        ocu_channels mask = ocl.allocMemChannels(channel_size);
+        ocu_channels mask_dc = ocl.allocMemChannels(channel_size);
+        cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize);
+        cuCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step);
 
-    err = cuStreamSynchronize(ocl.stream);
+        ocl.releaseMemChannels(mask);
+        ocl.releaseMemChannels(mask_dc);
+    }
 
-    ocl.releaseMemChannels(c0);
-    ocl.releaseMemChannels(c1);
+    cuCalculateDiffmapEx(mem_result, xsize, ysize, step);
+
+    cuMemcpyDtoH(result, mem_result, channel_size);
+
+    ocl.releaseMemChannels(xyb1);
+    ocl.releaseMemChannels(xyb0);
+
+    cuMemFree(edge_detector_map);
+    cuMemFree(block_diff_dc);
+    cuMemFree(block_diff_ac);
+
+    cuMemFree(mem_result);
 }
 
-void cuEdgeDetectorMapEx(
-    CUdeviceptr result/*out*/,
-    const ocu_channels &rgb, const ocu_channels &rgb2,
-    const size_t xsize, const size_t ysize, const size_t step)
+
+void cuComputeBlockZeroingOrder(
+    guetzli::CoeffData *output_order_batch,
+    const channel_info orig_channel[3],
+    const float *orig_image_batch,
+    const float *mask_scale,
+    const int image_width,
+    const int image_height,
+    const channel_info mayout_channel[3],
+    const int factor,
+    const int comp_mask,
+    const float BlockErrorLimit)
 {
-    size_t channel_size = xsize * ysize * sizeof(float);
+    const int block8_width = (image_width + 8 - 1) / 8;
+    const int block8_height = (image_height + 8 - 1) / 8;
+    const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor);
+    const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor);
+
+    using namespace guetzli;
 
+    cl_int err = 0;
     ocu_args_d_t &ocl = getOcu();
 
-    ocu_channels rgb_blured = ocl.allocMemChannels(channel_size);
-    ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size);
+    CUdeviceptr mem_orig_coeff[3];
+    CUdeviceptr mem_mayout_coeff[3];
+    CUdeviceptr mem_mayout_pixel[3];
+    for (int c = 0; c < 3; c++)
+    {
+        int block_count = orig_channel[c].block_width * orig_channel[c].block_height;
+        mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff);
 
-    static const double kSigma[3] = { 1.5, 0.586, 0.4 };
+        block_count = mayout_channel[c].block_width * mayout_channel[c].block_height;
+        mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff);
 
-    for (int i = 0; i < 3; i++)
-    {
-        cuBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]);
-        cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]);
+        mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel);
     }
+    CUdeviceptr mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch);
+    CUdeviceptr mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale);
 
-    const void *args[] = { &result,
-        &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
-        &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
-        &xsize, &ysize, &step };
+    int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height;
+    CUdeviceptr mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch);
 
-    const size_t res_xsize = (xsize + step - 1) / step;
-    const size_t res_ysize = (ysize + step - 1) / step;
+    const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2],
+        &mem_orig_image, &mem_orig_image, &mem_mask_scale,
+        &image_width, &image_height,
+        &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2],
+        &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2],
+        &mayout_channel[0], &mayout_channel[1], &mayout_channel[2],
+        &factor,
+        &comp_mask,
+        &BlockErrorLimit,
+        &mem_output_order_batch };
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTOR],
-        res_xsize, res_ysize, 1,
+    err = cuLaunchKernel(ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER],
+        blockf_width, blockf_height, 1,
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
 
     err = cuStreamSynchronize(ocl.stream);
 
-    ocl.releaseMemChannels(rgb_blured);
-    ocl.releaseMemChannels(rgb2_blured);
-}
-
-void cuBlockDiffMapEx(
-    CUdeviceptr block_diff_dc/*out*/,
-    CUdeviceptr block_diff_ac/*out*/,
-    const ocu_channels &rgb, const ocu_channels &rgb2,
-    const size_t xsize, const size_t ysize, const size_t step)
-{
-    ocu_args_d_t &ocl = getOcu();
-
-    const void *args[] = { &block_diff_dc, &block_diff_ac,
-        &rgb.r, &rgb.g, &rgb.b,
-        &rgb2.r, &rgb2.g, &rgb2.b,
-        &xsize, &ysize, &step };
+    cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size);
 
-    const size_t res_xsize = (xsize + step - 1) / step;
-    const size_t res_ysize = (ysize + step - 1) / step;
+    for (int c = 0; c < 3; c++)
+    {
+        cuMemFree(mem_orig_coeff[c]);
+        cuMemFree(mem_mayout_coeff[c]);
+        cuMemFree(mem_mayout_pixel[c]);
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_BLOCKDIFFMAP],
-        res_xsize, res_ysize, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
+    }
 
-    err = cuStreamSynchronize(ocl.stream);
+    cuMemFree(mem_orig_image);
+    cuMemFree(mem_mask_scale);
+    cuMemFree(mem_output_order_batch);
 }
 
-void cuEdgeDetectorLowFreqEx(
-    CUdeviceptr block_diff_ac/*in,out*/,
-    const ocu_channels &rgb, const ocu_channels &rgb2,
-    const size_t xsize, const size_t ysize, const size_t step)
+void cuMask(
+    float* mask_r, float* mask_g, float* mask_b,
+    float* maskdc_r, float* maskdc_g, float* maskdc_b,
+    const size_t xsize, const size_t ysize,
+    const float* r, const float* g, const float* b,
+    const float* r2, const float* g2, const float* b2)
 {
-    size_t channel_size = xsize * ysize * sizeof(float);
-
-    static const double kSigma = 14;
-
     ocu_args_d_t &ocl = getOcu();
-    ocu_channels rgb_blured = ocl.allocMemChannels(channel_size);
-    ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size);
-
-    for (int i = 0; i < 3; i++)
-    {
-        cuBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]);
-        cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]);
-    }
 
-    const void *args[] = { &block_diff_ac,
-        &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
-        &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
-        &xsize, &ysize, &step };
+    size_t channel_size = xsize * ysize * sizeof(float);
 
-    const size_t res_xsize = (xsize + step - 1) / step;
-    const size_t res_ysize = (ysize + step - 1) / step;
+    ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
+    ocu_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2);
+    ocu_channels mask = ocl.allocMemChannels(channel_size);
+    ocu_channels mask_dc = ocl.allocMemChannels(channel_size);
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ],
-        res_xsize, res_ysize, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
+    cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize);
 
-    err = cuStreamSynchronize(ocl.stream);
+    cuMemcpyDtoH(mask_r, mask.r, channel_size);
+    cuMemcpyDtoH(mask_g, mask.g, channel_size);
+    cuMemcpyDtoH(mask_b, mask.b, channel_size);
+    cuMemcpyDtoH(maskdc_r, mask_dc.r, channel_size);
+    cuMemcpyDtoH(maskdc_g, mask_dc.g, channel_size);
+    cuMemcpyDtoH(maskdc_b, mask_dc.b, channel_size);
 
-    ocl.releaseMemChannels(rgb_blured);
-    ocl.releaseMemChannels(rgb2_blured);
+    ocl.releaseMemChannels(rgb);
+    ocl.releaseMemChannels(rgb2);
+    ocl.releaseMemChannels(mask);
+    ocl.releaseMemChannels(mask_dc);
 }
 
-void cuDiffPrecomputeEx(
-    ocu_channels &mask/*out*/,
-    const ocu_channels &xyb0, const ocu_channels &xyb1,
-    const size_t xsize, const size_t ysize)
+void cuConvolutionXEx(
+    CUdeviceptr result/*out*/,
+    const CUdeviceptr inp, size_t xsize, size_t ysize,
+    const CUdeviceptr multipliers, size_t len,
+    int xstep, int offset, double border_ratio)
 {
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
-    const void *args[] = { &mask.x, &mask.y, &mask.b,
-        &xyb0.x, &xyb0.y, &xyb0.b,
-        &xyb1.x, &xyb1.y, &xyb1.b };
+    const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DIFFPRECOMPUTE],
+    CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX],
         xsize, ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocu.stream, (void**)args, NULL);
 
-    err = cuStreamSynchronize(ocl.stream);
+    err = cuStreamSynchronize(ocu.stream);
 }
 
-void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w)
+void cuConvolutionYEx(
+    CUdeviceptr result/*out*/,
+    const CUdeviceptr inp, size_t xsize, size_t ysize,
+    const CUdeviceptr multipliers, size_t len,
+    int xstep, int offset, double border_ratio)
 {
-    ocu_args_d_t &ocl = getOcu();
+    CUresult err = CUDA_SUCCESS;
+    ocu_args_d_t &ocu = getOcu();
 
-    const void *args[] = { &img, &w };
+    const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_SCALEIMAGE],
-        size, 1, 1,
+    err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONY],
+        xsize, ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocu.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocu.stream);
+}
+
+void cuSquareSampleEx(
+    CUdeviceptr result/*out*/,
+    const CUdeviceptr image, size_t xsize, size_t ysize,
+    size_t xstep, size_t ystep)
+{
+    CUresult err = CUDA_SUCCESS;
+    ocu_args_d_t &ocu = getOcu();
+
+    const void *args[] = { &result, &image, &xstep, &ystep };
+
+    err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocu.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocu.stream);
+}
+
+void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize,
+    const double sigma, const double border_ratio,
+    CUdeviceptr result/*out, opt*/)
+{
+    double m = 2.25;  // Accuracy increases when m is increased.
+    const double scaler = -1.0 / (2 * sigma * sigma);
+    // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52}
+    const int diff = std::max<int>(1, m * fabs(sigma));
+    const int expn_size = 2 * diff + 1;
+    std::vector<float> expn(expn_size);
+    for (int i = -diff; i <= diff; ++i) {
+        expn[i + diff] = static_cast<float>(exp(scaler * i * i));
+    }
+
+    const int xstep = std::max<int>(1, int(sigma / 3));
+
+    CUresult err = CUDA_SUCCESS;
+    ocu_args_d_t &ocu = getOcu();
+    CUdeviceptr mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data());
+
+    if (xstep > 1)
+    {
+        CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize);
+        cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        cuSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep);
+        cuMemFree(srcA);
+    }
+    else
+    {
+        CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize);
+        cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        cuMemFree(srcA);
+    }
+
+    cuMemFree(mem_expn);
+}
+
+void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize)
+{
+    static const double kSigma = 1.1;
+
+    size_t channel_size = xsize * ysize * sizeof(float);
+
+    CUresult err = CUDA_SUCCESS;
+    ocu_args_d_t &ocl = getOcu();
+    ocu_channels rgb_blurred = ocl.allocMemChannels(channel_size);
+
+    cuBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r);
+    cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g);
+    cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
+
+    void *args[] = { &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b };
+
+    CUresult r = cuLaunchKernel(ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE],
+        xsize * ysize, 1, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, args, NULL);
+
+    r = cuStreamSynchronize(ocl.stream);
+
+    ocl.releaseMemChannels(rgb_blurred);
+}
+
+void cuMaskHighIntensityChangeEx(
+    ocu_channels &xyb0/*in,out*/,
+    ocu_channels &xyb1/*in,out*/,
+    const size_t xsize, const size_t ysize)
+{
+    size_t channel_size = xsize * ysize * sizeof(float);
+
+    ocu_args_d_t &ocl = getOcu();
+
+    ocu_channels c0 = ocl.allocMemChannels(channel_size);
+    ocu_channels c1 = ocl.allocMemChannels(channel_size);
+
+    cuMemcpyDtoD(c0.r, xyb0.r, channel_size);
+    cuMemcpyDtoD(c0.g, xyb0.g, channel_size);
+    cuMemcpyDtoD(c0.b, xyb0.b, channel_size);
+    cuMemcpyDtoD(c1.r, xyb1.r, channel_size);
+    cuMemcpyDtoD(c1.g, xyb1.g, channel_size);
+    cuMemcpyDtoD(c1.b, xyb1.b, channel_size);
+
+    const void *args[] = { &xyb0.r, &xyb0.g, &xyb0.b,
+        &xyb1.r, &xyb1.g, &xyb1.b,
+        &c0.r, &c0.g, &c0.b,
+        &c1.r, &c1.g, &c1.b };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    ocl.releaseMemChannels(c0);
+    ocl.releaseMemChannels(c1);
+}
+
+void cuEdgeDetectorMapEx(
+    CUdeviceptr result/*out*/,
+    const ocu_channels &rgb, const ocu_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step)
+{
+    size_t channel_size = xsize * ysize * sizeof(float);
+
+    ocu_args_d_t &ocl = getOcu();
+
+    ocu_channels rgb_blured = ocl.allocMemChannels(channel_size);
+    ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size);
+
+    static const double kSigma[3] = { 1.5, 0.586, 0.4 };
+
+    for (int i = 0; i < 3; i++)
+    {
+        cuBlurEx(rgb.ch[i], xsize, ysize, kSigma[i], 0.0, rgb_blured.ch[i]);
+        cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]);
+    }
+
+    const void *args[] = { &result,
+        &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
+        &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
+        &xsize, &ysize, &step };
+
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTOR],
+        res_xsize, res_ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    ocl.releaseMemChannels(rgb_blured);
+    ocl.releaseMemChannels(rgb2_blured);
+}
+
+void cuBlockDiffMapEx(
+    CUdeviceptr block_diff_dc/*out*/,
+    CUdeviceptr block_diff_ac/*out*/,
+    const ocu_channels &rgb, const ocu_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    const void *args[] = { &block_diff_dc, &block_diff_ac,
+        &rgb.r, &rgb.g, &rgb.b,
+        &rgb2.r, &rgb2.g, &rgb2.b,
+        &xsize, &ysize, &step };
+
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_BLOCKDIFFMAP],
+        res_xsize, res_ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+}
+
+void cuEdgeDetectorLowFreqEx(
+    CUdeviceptr block_diff_ac/*in,out*/,
+    const ocu_channels &rgb, const ocu_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step)
+{
+    size_t channel_size = xsize * ysize * sizeof(float);
+
+    static const double kSigma = 14;
+
+    ocu_args_d_t &ocl = getOcu();
+    ocu_channels rgb_blured = ocl.allocMemChannels(channel_size);
+    ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size);
+
+    for (int i = 0; i < 3; i++)
+    {
+        cuBlurEx(rgb.ch[i], xsize, ysize, kSigma, 0.0, rgb_blured.ch[i]);
+        cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]);
+    }
+
+    const void *args[] = { &block_diff_ac,
+        &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
+        &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
+        &xsize, &ysize, &step };
+
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ],
+        res_xsize, res_ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+
+    ocl.releaseMemChannels(rgb_blured);
+    ocl.releaseMemChannels(rgb2_blured);
+}
+
+void cuDiffPrecomputeEx(
+    ocu_channels &mask/*out*/,
+    const ocu_channels &xyb0, const ocu_channels &xyb1,
+    const size_t xsize, const size_t ysize)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    const void *args[] = { &mask.x, &mask.y, &mask.b,
+        &xyb0.x, &xyb0.y, &xyb0.b,
+        &xyb1.x, &xyb1.y, &xyb1.b };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DIFFPRECOMPUTE],
+        xsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+}
+
+void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    const void *args[] = { &img, &w };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_SCALEIMAGE],
+        size, 1, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
 
     err = cuStreamSynchronize(ocl.stream);
 }
@@ -516,286 +799,3 @@ void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, con
     cuMemFree(blurred);
 }
 
-void cuDiffmapOpsinDynamicsImage(
-    float* result,
-    const float* r, const float* g, const float* b,
-    const float* r2, const float* g2, const float* b2,
-    const size_t xsize, const size_t ysize,
-    const size_t step)
-{
-    const size_t res_xsize = (xsize + step - 1) / step;
-    const size_t res_ysize = (ysize + step - 1) / step;
-
-    size_t channel_size = xsize * ysize * sizeof(float);
-    size_t channel_step_size = res_xsize * res_ysize * sizeof(float);
-
-    CUresult err = CUDA_SUCCESS;
-    ocu_args_d_t &ocl = getOcu();
-    ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
-    ocu_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
-
-    CUdeviceptr mem_result = ocl.allocMem(channel_size, result);
-
-    CUdeviceptr edge_detector_map = ocl.allocMem(3 * channel_step_size);
-    CUdeviceptr block_diff_dc = ocl.allocMem(3 * channel_step_size);
-    CUdeviceptr block_diff_ac = ocl.allocMem(3 * channel_step_size);
-
-    cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
-
-    cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step);
-    cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step);
-    cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step);
-    {
-        ocu_channels mask = ocl.allocMemChannels(channel_size);
-        ocu_channels mask_dc = ocl.allocMemChannels(channel_size);
-        cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize);
-        cuCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step);
-
-        ocl.releaseMemChannels(mask);
-        ocl.releaseMemChannels(mask_dc);
-    }
-
-    cuCalculateDiffmapEx(mem_result, xsize, ysize, step);
-
-    cuMemcpyDtoH(result, mem_result, channel_size);
-
-    ocl.releaseMemChannels(xyb1);
-    ocl.releaseMemChannels(xyb0);
-
-    cuMemFree(edge_detector_map);
-    cuMemFree(block_diff_dc);
-    cuMemFree(block_diff_ac);
-
-    cuMemFree(mem_result);
-}
-
-void cuComputeBlockZeroingOrder(
-    guetzli::CoeffData *output_order_batch,
-    const channel_info orig_channel[3],
-    const float *orig_image_batch,
-    const float *mask_scale,
-    const int image_width,
-    const int image_height,
-    const channel_info mayout_channel[3],
-    const int factor,
-    const int comp_mask,
-    const float BlockErrorLimit)
-{
-    const int block8_width = (image_width + 8 - 1) / 8;
-    const int block8_height = (image_height + 8 - 1) / 8;
-    const int blockf_width = (image_width + 8 * factor - 1) / (8 * factor);
-    const int blockf_height = (image_height + 8 * factor - 1) / (8 * factor);
-
-    using namespace guetzli;
-
-    cl_int err = 0;
-    ocu_args_d_t &ocl = getOcu();
-
-    CUdeviceptr mem_orig_coeff[3];
-    CUdeviceptr mem_mayout_coeff[3];
-    CUdeviceptr mem_mayout_pixel[3];
-    for (int c = 0; c < 3; c++)
-    {
-        int block_count = orig_channel[c].block_width * orig_channel[c].block_height;
-        mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff);
-
-        block_count = mayout_channel[c].block_width * mayout_channel[c].block_height;
-        mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff);
-
-        mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel);
-    }
-    CUdeviceptr mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch);
-    CUdeviceptr mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale);
-
-    int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height;
-    CUdeviceptr mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch);
-
-    const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2],
-        &mem_orig_image, &mem_orig_image, &mem_mask_scale,
-        &image_width, &image_height,
-        &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2],
-        &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2],
-        &mayout_channel[0], &mayout_channel[1], &mayout_channel[2],
-        &factor,
-        &comp_mask,
-        &BlockErrorLimit,
-        &mem_output_order_batch };
-
-    err = cuLaunchKernel(ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER],
-        blockf_width, blockf_height, 1,
-        1, 1, 1,
-        0,
-        ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-
-    cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size);
-
-    for (int c = 0; c < 3; c++)
-    {
-        cuMemFree(mem_orig_coeff[c]);
-        cuMemFree(mem_mayout_coeff[c]);
-        cuMemFree(mem_mayout_pixel[c]);
-
-    }
-
-    cuMemFree(mem_orig_image);
-    cuMemFree(mem_mask_scale);
-    cuMemFree(mem_output_order_batch);
-}
-
-void cuMask(
-    float* mask_r, float* mask_g, float* mask_b,
-    float* maskdc_r, float* maskdc_g, float* maskdc_b,
-    const size_t xsize, const size_t ysize,
-    const float* r, const float* g, const float* b,
-    const float* r2, const float* g2, const float* b2)
-{
-    CUresult err = CUDA_SUCCESS;
-    ocu_args_d_t &ocl = getOcu();
-
-    size_t channel_size = xsize * ysize * sizeof(float);
-
-    ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
-    ocu_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2);
-    ocu_channels mask = ocl.allocMemChannels(channel_size);
-    ocu_channels mask_dc = ocl.allocMemChannels(channel_size);
-
-    cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize);
-
-    cuMemcpyDtoH(mask_r, mask.r, channel_size);
-    cuMemcpyDtoH(mask_g, mask.g, channel_size);
-    cuMemcpyDtoH(mask_b, mask.b, channel_size);
-    cuMemcpyDtoH(maskdc_r, mask_dc.r, channel_size);
-    cuMemcpyDtoH(maskdc_g, mask_dc.g, channel_size);
-    cuMemcpyDtoH(maskdc_b, mask_dc.b, channel_size);
-
-    ocl.releaseMemChannels(rgb);
-    ocl.releaseMemChannels(rgb2);
-    ocl.releaseMemChannels(mask);
-    ocl.releaseMemChannels(mask_dc);
-}
-
-void cuConvolutionXEx(
-    CUdeviceptr result/*out*/,
-    const CUdeviceptr inp, size_t xsize, size_t ysize,
-    const CUdeviceptr multipliers, size_t len,
-    int xstep, int offset, double border_ratio)
-{
-    ocu_args_d_t &ocu = getOcu();
-
-    const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
-
-    CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX],
-        xsize, ysize, 1,
-        1, 1, 1,
-        0,
-        ocu.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocu.stream);
-}
-
-void cuConvolutionYEx(
-    CUdeviceptr result/*out*/,
-    const CUdeviceptr inp, size_t xsize, size_t ysize,
-    const CUdeviceptr multipliers, size_t len,
-    int xstep, int offset, double border_ratio)
-{
-    CUresult err = CUDA_SUCCESS;
-    ocu_args_d_t &ocu = getOcu();
-
-    const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
-
-    err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONY],
-        xsize, ysize, 1,
-        1, 1, 1,
-        0,
-        ocu.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocu.stream);
-}
-
-void cuSquareSampleEx(
-    CUdeviceptr result/*out*/,
-    const CUdeviceptr image, size_t xsize, size_t ysize,
-    size_t xstep, size_t ystep)
-{
-    CUresult err = CUDA_SUCCESS;
-    ocu_args_d_t &ocu = getOcu();
-
-    const void *args[] = { &result, &image, &xstep, &ystep };
-
-    err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE],
-        xsize, ysize, 1,
-        1, 1, 1,
-        0,
-        ocu.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocu.stream);
-}
-
-void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize,
-    const double sigma, const double border_ratio,
-    CUdeviceptr result/*out, opt*/)
-{
-    double m = 2.25;  // Accuracy increases when m is increased.
-    const double scaler = -1.0 / (2 * sigma * sigma);
-    // For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52}
-    const int diff = std::max<int>(1, m * fabs(sigma));
-    const int expn_size = 2 * diff + 1;
-    std::vector<float> expn(expn_size);
-    for (int i = -diff; i <= diff; ++i) {
-        expn[i + diff] = static_cast<float>(exp(scaler * i * i));
-    }
-
-    const int xstep = std::max<int>(1, int(sigma / 3));
-
-    CUresult err = CUDA_SUCCESS;
-    ocu_args_d_t &ocu = getOcu();
-    CUdeviceptr mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data());
-
-    if (xstep > 1)
-    {
-        CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize);
-        cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-        cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-        cuSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep);
-        cuMemFree(srcA);
-    }
-    else
-    {
-        CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize);
-        cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-        cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-        cuMemFree(srcA);
-    }
-
-    cuMemFree(mem_expn);
-}
-
-void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize)
-{
-    static const double kSigma = 1.1;
-
-    size_t channel_size = xsize * ysize * sizeof(float);
-
-    CUresult err = CUDA_SUCCESS;
-    ocu_args_d_t &ocu = getOcu();
-    ocu_channels rgb_blurred = ocu.allocMemChannels(channel_size);
-
-    cuBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r);
-    cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g);
-    cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
-
-    void *args[] = { &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b };
-
-    CUresult r = cuLaunchKernel(ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE],
-        xsize * ysize, 1, 1,
-        1, 1, 1,
-        0,
-        ocu.stream, args, NULL);
-
-    r = cuStreamSynchronize(ocu.stream);
-
-    ocu.releaseMemChannels(rgb_blurred);
-}
diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h
index 14c607cc..0783796a 100644
--- a/clguetzli/cuguetzli.h
+++ b/clguetzli/cuguetzli.h
@@ -2,7 +2,9 @@
 #include "guetzli/processor.h"
 #include "clguetzli.cl.h"
 
-void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize);
+void cuOpsinDynamicsImage(
+	float *r, float *g, float *b, 
+	const size_t xsize, const size_t ysize);
 
 void cuDiffmapOpsinDynamicsImage(
     float* result,
@@ -30,8 +32,84 @@ void cuMask(
     const float* r, const float* g, const float* b,
     const float* r2, const float* g2, const float* b2);
 
+void cuConvolutionXEx(
+    CUdeviceptr result/*out*/,
+    const CUdeviceptr inp, size_t xsize, size_t ysize,
+    const CUdeviceptr multipliers, size_t len,
+    int xstep, int offset, double border_ratio);
+
+void cuConvolutionYEx(
+    CUdeviceptr result/*out*/,
+    const CUdeviceptr inp, size_t xsize, size_t ysize,
+    const CUdeviceptr multipliers, size_t len,
+    int xstep, int offset, double border_ratio);
+
+void cuSquareSampleEx(
+    CUdeviceptr result/*out*/,
+    const CUdeviceptr image, size_t xsize, size_t ysize,
+    size_t xstep, size_t ystep);
+
 void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize,
     const double sigma, const double border_ratio,
     CUdeviceptr result = NULL/*out, opt*/);
 
-void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize);
\ No newline at end of file
+void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize);
+
+void cuMaskHighIntensityChangeEx(
+    ocu_channels &xyb0/*in,out*/,
+    ocu_channels &xyb1/*in,out*/,
+    const size_t xsize, const size_t ysize);
+
+void cuEdgeDetectorMapEx(
+    CUdeviceptr result/*out*/,
+    const ocu_channels &rgb, const ocu_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step);
+
+void cuBlockDiffMapEx(
+    CUdeviceptr block_diff_dc/*out*/,
+    CUdeviceptr block_diff_ac/*out*/,
+    const ocu_channels &rgb, const ocu_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step);
+
+void cuEdgeDetectorLowFreqEx(
+    CUdeviceptr block_diff_ac/*in,out*/,
+    const ocu_channels &rgb, const ocu_channels &rgb2,
+    const size_t xsize, const size_t ysize, const size_t step);
+
+void cuDiffPrecomputeEx(
+    ocu_channels &mask/*out*/,
+    const ocu_channels &xyb0, const ocu_channels &xyb1,
+    const size_t xsize, const size_t ysize);
+
+void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w);
+
+void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t ysize);
+
+void cuMinSquareValEx(
+    CUdeviceptr img/*in,out*/,
+    const size_t xsize, const size_t ysize,
+    const size_t square_size, const size_t offset);
+
+void cuMaskEx(
+    ocu_channels mask/*out*/, ocu_channels mask_dc/*out*/,
+    const ocu_channels &rgb, const ocu_channels &rgb2,
+    const size_t xsize, const size_t ysize);
+
+void cuCombineChannelsEx(
+    CUdeviceptr result/*out*/,
+    const ocu_channels &mask,
+    const ocu_channels &mask_dc,
+    const size_t xsize, const size_t ysize,
+    const CUdeviceptr block_diff_dc,
+    const CUdeviceptr block_diff_ac,
+    const CUdeviceptr edge_detector_map,
+    const size_t res_xsize,
+    const size_t step);
+
+void cuUpsampleSquareRootEx(CUdeviceptr diffmap, const size_t xsize, const size_t ysize, const int step);
+
+void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr in, const size_t xsize, const size_t ysize, const int step);
+
+void cuAddBorderEx(CUdeviceptr out, const size_t xsize, const size_t ysize, const int step, const CUdeviceptr in);
+
+void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step);

From b4d0ffe9dde8b2ee45ccfe043d08de7043ba957e Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 3 Jun 2017 02:49:23 +0800
Subject: [PATCH 127/189] =?UTF-8?q?=E7=AE=80=E5=8C=96=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cpp      | 311 ++++++++++-------------------------
 clguetzli/clguetzli_test.cpp |   6 +-
 clguetzli/cuguetzli.cpp      |  16 +-
 clguetzli/ocl.cpp            | 133 +--------------
 clguetzli/ocl.h              |  20 +--
 5 files changed, 104 insertions(+), 382 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 0774a074..c6c5eb4f 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -16,10 +16,7 @@ ocl_args_d_t& getOcl(void)
 
 	bInit = true;
 	cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 
 	char* source = nullptr;
 	size_t src_size = 0;
@@ -30,21 +27,18 @@ ocl_args_d_t& getOcl(void)
 	delete[] source;
 
 	err = clBuildProgram(ocl.program, 1, &ocl.device, "", NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
-
-        if (err == CL_BUILD_PROGRAM_FAILURE)
-        {
-            size_t log_size = 0;
-            clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+    LOG_CL_RESULT(err);
+    if (CL_BUILD_PROGRAM_FAILURE == err)
+    {
+        size_t log_size = 0;
+        clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
 
-            std::vector<char> build_log(log_size);
-            clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL);
+        std::vector<char> build_log(log_size);
+        clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL);
 
-            LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]);
-        }
-	}
+        LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]);
+    }
+	
     ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "clConvolutionEx", &err);
     ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionXEx", &err);
     ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionYEx", &err);
@@ -208,15 +202,9 @@ void clComputeBlockZeroingOrder(
 
     size_t globalWorkSize[2] = { blockf_width, blockf_height };
     err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-    if (CL_SUCCESS != err)
-    {
-        LogError("Error: clComputeBlockZeroingOrder() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-    }
+    LOG_CL_RESULT(err);
     err = clFinish(ocl.commandQueue);
-    if (CL_SUCCESS != err)
-    {
-        LogError("Error: clComputeBlockZeroingOrder() clFinish returned %s.\n", TranslateOpenCLError(err));
-    }
+    LOG_CL_RESULT(err);
 
     clEnqueueReadBuffer(ocl.commandQueue, mem_output_order_batch, false, 0, output_order_batch_size, output_order_batch, 0, NULL, NULL);
     clFinish(ocl.commandQueue);
@@ -226,7 +214,6 @@ void clComputeBlockZeroingOrder(
         clReleaseMemObject(mem_orig_coeff[c]);
         clReleaseMemObject(mem_mayout_coeff[c]);
         clReleaseMemObject(mem_mayout_pixel[c]);
-
     }
 
     clReleaseMemObject(mem_orig_image);
@@ -258,7 +245,7 @@ void clMask(
     clEnqueueReadBuffer(ocl.commandQueue, mask_dc.r, false, 0, channel_size, maskdc_r, 0, NULL, NULL);
     clEnqueueReadBuffer(ocl.commandQueue, mask_dc.g, false, 0, channel_size, maskdc_g, 0, NULL, NULL);
     clEnqueueReadBuffer(ocl.commandQueue, mask_dc.b, false, 0, channel_size, maskdc_b, 0, NULL, NULL);
-    cl_int err = clFinish(ocl.commandQueue);
+    clFinish(ocl.commandQueue);
 
     ocl.releaseMemChannels(rgb);
     ocl.releaseMemChannels(rgb2);
@@ -272,7 +259,6 @@ void clConvolutionEx(
     const cl_mem multipliers, size_t len,
     int xstep, int offset, double border_ratio)
 {
-	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
 	size_t oxsize = (xsize + xstep - 1) / xstep;
@@ -294,16 +280,10 @@ void clConvolutionEx(
 	clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio);
 
 	size_t globalWorkSize[2] = { oxsize, ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clConvolutionEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clConvolutionEx() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 }
 
 void clConvolutionXEx(
@@ -312,7 +292,6 @@ void clConvolutionXEx(
 	const cl_mem multipliers, size_t len,
 	int xstep, int offset, double border_ratio)
 {
-	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_int clxstep = xstep;
@@ -330,16 +309,10 @@ void clConvolutionXEx(
 	clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clConvolutionEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clConvolutionEx() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 }
 
 void clConvolutionYEx(
@@ -348,7 +321,6 @@ void clConvolutionYEx(
 	const cl_mem multipliers, size_t len,
 	int xstep, int offset, double border_ratio)
 {
-	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_int clxstep = xstep;
@@ -366,16 +338,10 @@ void clConvolutionYEx(
 	clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clConvolutionEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clConvolutionEx() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 }
 
 void clSquareSampleEx(
@@ -383,7 +349,6 @@ void clSquareSampleEx(
     const cl_mem image, size_t xsize, size_t ysize,
 	size_t xstep, size_t ystep)
 {
-	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_int clxstep = xstep;
@@ -395,16 +360,10 @@ void clSquareSampleEx(
 	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clystep);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clUpsampleEx clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clUpsampleEx clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 }
 
 void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
@@ -423,23 +382,24 @@ void clBlurEx(cl_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
 
 	const int xstep = std::max<int>(1, int(sigma / 3));
 
-	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
 	cl_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data());
 
 	if (xstep > 1)
 	{
-		ocl.allocA(sizeof(cl_float) * xsize * ysize);
-		clConvolutionXEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-		clConvolutionYEx(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        cl_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize);
+		clConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+		clConvolutionYEx(result ? result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
         clSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep);
+        clReleaseMemObject(m);
 	}
 	else
 	{
-		ocl.allocA(sizeof(cl_float) * xsize * ysize);
-		clConvolutionXEx(ocl.srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-		clConvolutionYEx(result ? result : image, ocl.srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-	}
+        cl_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize);
+		clConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+		clConvolutionYEx(result ? result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        clReleaseMemObject(m);
+    }
 
 	clReleaseMemObject(mem_expn);
 }
@@ -450,7 +410,6 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t
 
 	size_t channel_size = xsize * ysize * sizeof(float);
 
-	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
 	ocl_channels rgb_blurred = ocl.allocMemChannels(channel_size);
 
@@ -467,21 +426,14 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t
 	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&rgb_blurred.b);
 
 	size_t globalWorkSize[1] = { xsize * ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clOpsinDynamicsImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clOpsinDynamicsImageEx() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 
 	ocl.releaseMemChannels(rgb_blurred);
 }
 
-
 void clMaskHighIntensityChangeEx(
     ocl_channels &xyb0/*in,out*/,
     ocl_channels &xyb1/*in,out*/,
@@ -500,7 +452,7 @@ void clMaskHighIntensityChangeEx(
 	clEnqueueCopyBuffer(ocl.commandQueue, xyb1.r, c1.r, 0, 0, channel_size, 0, NULL, NULL);
 	clEnqueueCopyBuffer(ocl.commandQueue, xyb1.g, c1.g, 0, 0, channel_size, 0, NULL, NULL);
 	clEnqueueCopyBuffer(ocl.commandQueue, xyb1.b, c1.b, 0, 0, channel_size, 0, NULL, NULL);
-	cl_int err = clFinish(ocl.commandQueue);
+	clFinish(ocl.commandQueue);
 
 	cl_kernel kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE];
 	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.r);
@@ -517,16 +469,10 @@ void clMaskHighIntensityChangeEx(
 	clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&c1.b);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clMaskHighIntensityChangeEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clMaskHighIntensityChangeEx() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 
 	ocl.releaseMemChannels(c0);
 	ocl.releaseMemChannels(c1);
@@ -572,15 +518,9 @@ void clEdgeDetectorMapEx(
 
 	size_t globalWorkSize[2] = { res_xsize, res_ysize};
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clEdgeDetectorMapEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clEdgeDetectorMapEx() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 
 	ocl.releaseMemChannels(rgb_blured);
 	ocl.releaseMemChannels(rgb2_blured);
@@ -616,15 +556,9 @@ void clBlockDiffMapEx(
 
 	size_t globalWorkSize[2] = { res_xsize, res_ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clBlockDiffMapEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clBlockDiffMapEx() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 }
 
 void clEdgeDetectorLowFreqEx(
@@ -666,15 +600,9 @@ void clEdgeDetectorLowFreqEx(
 
 	size_t globalWorkSize[2] = { res_xsize, res_ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clEdgeDetectorLowFreqEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clEdgeDetectorLowFreqEx() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 
 	ocl.releaseMemChannels(rgb_blured);
 	ocl.releaseMemChannels(rgb2_blured);
@@ -685,7 +613,6 @@ void clDiffPrecomputeEx(
     const ocl_channels &xyb0, const ocl_channels &xyb1, 
     const size_t xsize, const size_t ysize)
 {
-	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE];
@@ -700,21 +627,14 @@ void clDiffPrecomputeEx(
 	clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&xyb1.b);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clDiffPrecomputeEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clDiffPrecomputeEx() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 }
 
 void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w)
 {
-	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_double clscale = w;
@@ -724,16 +644,10 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w)
 	clSetKernelArg(kernel, 1, sizeof(cl_double), (void*)&clscale);
 
 	size_t globalWorkSize[1] = { size };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clScaleImageEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clScaleImageEx() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 }
 
 void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize)
@@ -743,30 +657,24 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize
 	    return;
     }
 
-    cl_int err = CL_SUCCESS;
     ocl_args_d_t &ocl = getOcl();
 
     size_t len = xsize * ysize * sizeof(float);
-    ocl.allocA(len);
-    cl_mem img_org = ocl.srcA;
+    cl_mem img_org = ocl.allocMem(len);
 
-    err = clEnqueueCopyBuffer(ocl.commandQueue, img, img_org, 0, 0, len, 0, NULL, NULL);
+    clEnqueueCopyBuffer(ocl.commandQueue, img, img_org, 0, 0, len, 0, NULL, NULL);
 
     cl_kernel kernel = ocl.kernel[KERNEL_AVERAGE5X5];
     clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img);
     clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img_org);
 
     size_t globalWorkSize[2] = { xsize, ysize };
-    err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-    if (CL_SUCCESS != err)
-    {
-    LogError("Error: clAverage5x5Ex() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-    }
+    cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
     err = clFinish(ocl.commandQueue);
-    if (CL_SUCCESS != err)
-    {
-    LogError("Error: clAverage5x5Ex() clFinish returned %s.\n", TranslateOpenCLError(err));
-    }
+    LOG_CL_RESULT(err);
+
+    clReleaseMemObject(img_org);
 }
 
 void clMinSquareValEx(
@@ -774,36 +682,26 @@ void clMinSquareValEx(
     const size_t xsize, const size_t ysize, 
     const size_t square_size, const size_t offset)
 {
-	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_int cloffset = offset;
 	cl_int clsquare_size = square_size;
-	ocl.allocA(sizeof(cl_float) * xsize * ysize);
+	cl_mem result = ocl.allocMem(sizeof(cl_float) * xsize * ysize);
 
 	cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&ocl.srcA);
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result);
 	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img);
 	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clsquare_size);
 	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cloffset);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clMinSquareValEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
-
-	err = clEnqueueCopyBuffer(ocl.commandQueue, ocl.srcA, img, 0, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clMinSquareValEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err));
-	}
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
+	err = clEnqueueCopyBuffer(ocl.commandQueue, result, img, 0, 0, sizeof(cl_float) * xsize * ysize, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clMinSquareValEx() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
+    clReleaseMemObject(result);
 }
 
 static void MakeMask(double extmul, double extoff,
@@ -822,7 +720,6 @@ static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold;
 
 void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, size_t xsize, size_t ysize)
 {
-	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
 	double extmul = 0.975741017749;
@@ -922,16 +819,10 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 	clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&xyb_dc.b);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clDoMask() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clDoMask() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 
 	ocl.releaseMemChannels(xyb);
 	ocl.releaseMemChannels(xyb_dc);
@@ -977,7 +868,6 @@ void clCombineChannelsEx(
 	const size_t res_xsize,
 	const size_t step)
 {
-	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
 	const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step;
@@ -1005,21 +895,14 @@ void clCombineChannelsEx(
 	clSetKernelArg(kernel, 13, sizeof(cl_int), (void*)&clstep);
 
 	size_t globalWorkSize[2] = { work_xsize, work_ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clCombineChannelsEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clCombineChannelsEx() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 }
 
 void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysize, const int step)
 {
-	cl_int err = CL_SUCCESS;
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_int clxsize = xsize;
@@ -1039,29 +922,18 @@ void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysi
 	const size_t res_ysize = (ysize + step - 1) / step;
 
 	size_t globalWorkSize[2] = { res_xsize, res_ysize };
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clUpsampleSquareRootEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
-	err = clFinish(ocl.commandQueue);
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
 	err = clEnqueueCopyBuffer(ocl.commandQueue, diffmap_out, diffmap, 0, 0, xsize * ysize * sizeof(float), 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clUpsampleSquareRootEx() clEnqueueCopyBuffer returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clUpsampleSquareRootEx() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 
     clReleaseMemObject(diffmap_out);
 }
 
 void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const size_t ysize, const int step)
 {
-	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_int cls = 8 - step;
@@ -1075,21 +947,14 @@ void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const siz
 	clSetKernelArg(kernel, 4, sizeof(cl_int), &cls2);
 
 	size_t globalWorkSize[2] = { xsize - cls, ysize - cls};
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clCalculateDiffmapGetBlurredEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clCalculateDiffmapGetBlurredEx() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 }
 
 void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in)
 {
-	cl_int err = 0;
 	ocl_args_d_t &ocl = getOcl();
 
     cl_int cls = 8 - step;
@@ -1101,16 +966,10 @@ void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in)
 	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&in);
 
 	size_t globalWorkSize[2] = { xsize, ysize};
-	err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clGetDiffmapFromBlurredEx() clEnqueueNDRangeKernel returned %s.\n", TranslateOpenCLError(err));
-	}
+	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: clGetDiffmapFromBlurredEx() clFinish returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
 }
 
 void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step)
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 2cadfb85..9cb4007d 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -311,9 +311,8 @@ void tclConvolution(size_t xsize, size_t ysize,
 	size_t inp_size = xsize * ysize * sizeof(float);
 	size_t multipliers_size = len * sizeof(float);
 	cl_int err = 0;
-	ocl_args_d_t &ocl = getOcl();
-	ocl.allocA(result_size);
-	cl_mem r = ocl.srcA;
+    ocl_args_d_t &ocl = getOcl();
+    cl_mem r = ocl.allocMem(result_size);
 	cl_mem i = ocl.allocMem(inp_size, inp);
 	cl_mem m = ocl.allocMem(multipliers_size, multipliers);
 
@@ -327,6 +326,7 @@ void tclConvolution(size_t xsize, size_t ysize,
 	clEnqueueUnmapMemObject(ocl.commandQueue, r, r_r, 0, NULL, NULL);
 	err = clFinish(ocl.commandQueue);
 
+    clReleaseMemObject(r);
 	clReleaseMemObject(i);
 	clReleaseMemObject(m);
 }
diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
index 5eaba7e5..a445d930 100644
--- a/clguetzli/cuguetzli.cpp
+++ b/clguetzli/cuguetzli.cpp
@@ -183,17 +183,17 @@ void cuConvolutionXEx(
     const CUdeviceptr multipliers, size_t len,
     int xstep, int offset, double border_ratio)
 {
-    ocu_args_d_t &ocu = getOcu();
+    ocu_args_d_t &ocl = getOcu();
 
     const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
 
-    CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONX],
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTIONX],
         xsize, ysize, 1,
         1, 1, 1,
         0,
-        ocu.stream, (void**)args, NULL);
+        ocl.stream, (void**)args, NULL);
 
-    err = cuStreamSynchronize(ocu.stream);
+    err = cuStreamSynchronize(ocl.stream);
 }
 
 void cuConvolutionYEx(
@@ -203,17 +203,17 @@ void cuConvolutionYEx(
     int xstep, int offset, double border_ratio)
 {
     CUresult err = CUDA_SUCCESS;
-    ocu_args_d_t &ocu = getOcu();
+    ocu_args_d_t &ocl = getOcu();
 
     const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
 
-    err = cuLaunchKernel(ocu.kernel[KERNEL_CONVOLUTIONY],
+    err = cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTIONY],
         xsize, ysize, 1,
         1, 1, 1,
         0,
-        ocu.stream, (void**)args, NULL);
+        ocl.stream, (void**)args, NULL);
 
-    err = cuStreamSynchronize(ocu.stream);
+    err = cuStreamSynchronize(ocl.stream);
 }
 
 void cuSquareSampleEx(
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index d92fb1a4..5218ce9b 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -15,16 +15,7 @@ ocl_args_d_t::ocl_args_d_t() :
 	program(NULL),
 	platformVersion(OPENCL_VERSION_1_2),
 	deviceVersion(OPENCL_VERSION_1_2),
-	compilerVersion(OPENCL_VERSION_1_2),
-	srcA(NULL),
-	srcB(NULL),
-	dstMem(NULL),
-	inputA(NULL),
-	lenA(0),
-	inputB(NULL),
-	lenB(0),
-	outputC(NULL),
-	lenC(0)
+	compilerVersion(OPENCL_VERSION_1_2)
 {
 	for (int i = 0; i < KERNEL_COUNT; i++)
 	{
@@ -72,30 +63,6 @@ ocl_args_d_t::~ocl_args_d_t()
 			LogError("Error: clReleaseProgram returned '%s'.\n", TranslateOpenCLError(err));
 		}
 	}
-	if (srcA)
-	{
-		err = clReleaseMemObject(srcA);
-		if (CL_SUCCESS != err)
-		{
-			LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err));
-		}
-	}
-	if (srcB)
-	{
-		err = clReleaseMemObject(srcB);
-		if (CL_SUCCESS != err)
-		{
-			LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err));
-		}
-	}
-	if (dstMem)
-	{
-		err = clReleaseMemObject(dstMem);
-		if (CL_SUCCESS != err)
-		{
-			LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err));
-		}
-	}
 	if (commandQueue)
 	{
 		err = clReleaseCommandQueue(commandQueue);
@@ -120,118 +87,30 @@ ocl_args_d_t::~ocl_args_d_t()
 			LogError("Error: clReleaseContext returned '%s'.\n", TranslateOpenCLError(err));
 		}
 	}
-
-	/*
-	* Note there is no procedure to deallocate platform
-	* because it was not created at the startup,
-	* but just queried from OpenCL runtime.
-	*/
-
-	if (inputA) _aligned_free(inputA);
-	if (inputB) _aligned_free(inputB);
-	if (outputC) _aligned_free(outputC);
-}
-
-void* ocl_args_d_t::allocA(size_t s)
-{
-	if (s <= lenA) return inputA;
-	lenA = 0;
-	_aligned_free(inputA);
-	clReleaseMemObject(srcA);
-
-	cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64;
-	inputA = _aligned_malloc(optimizedSize, 4096);
-	lenA = s;
-
-	cl_int err = 0;
-	srcA = clCreateBuffer(this->context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, s, inputA, &err);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: allocA() for buffer returned %s.\n", TranslateOpenCLError(err));
-	}
-
-	return inputA;
-}
-
-void* ocl_args_d_t::allocB(size_t s)
-{
-	if (s <= lenB) return inputB;
-	lenB = 0;
-	_aligned_free(inputB);
-	clReleaseMemObject(srcB);
-
-	cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64;
-	inputB = _aligned_malloc(optimizedSize, 4096);
-	lenB = s;
-
-	cl_int err = 0;
-	srcB = clCreateBuffer(this->context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, s, inputB, &err);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: allocB() for buffer returned %s.\n", TranslateOpenCLError(err));
-	}
-
-	return inputB;
-}
-
-void* ocl_args_d_t::allocC(size_t s)
-{
-	if (s <= lenC) return outputC;
-	lenC = 0;
-	_aligned_free(outputC);
-	clReleaseMemObject(dstMem);
-
-	cl_uint optimizedSize = ((s - 1) / 64 + 1) * 64;
-	outputC = _aligned_malloc(optimizedSize, 4096);
-	lenC = s;
-	
-	cl_int err = 0;
-	dstMem = clCreateBuffer(this->context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, s, outputC, &err);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: allocB() for buffer returned %s.\n", TranslateOpenCLError(err));
-	}
-
-	return outputC;
 }
 
 cl_mem ocl_args_d_t::allocMem(size_t s, const void *init)
 {
 	cl_int err = 0;
 	cl_mem mem = clCreateBuffer(this->context, CL_MEM_READ_WRITE, s, nullptr, &err);
-	if (CL_SUCCESS != err)
-	{
-		LogError("Error: allocMem() for buffer returned %s.\n", TranslateOpenCLError(err));
-	}
+    LOG_CL_RESULT(err);
     if (!mem) return NULL;
     
     // init memory
     if (init)
     {
         err = clEnqueueWriteBuffer(this->commandQueue, mem, CL_FALSE, 0, s, init, 0, NULL, NULL);
-        if (CL_SUCCESS != err)
-        {
-            LogError("Error: allocMem() clEnqueueWriteBuffer return %s.\n", TranslateOpenCLError(err));
-        }
+        LOG_CL_RESULT(err);
         err = clFinish(this->commandQueue);
-        if (CL_SUCCESS != err)
-        {
-            LogError("Error: allocMem() clEnqueueWriteBuffer/clFinish return %s.\n", TranslateOpenCLError(err));
-        }
+        LOG_CL_RESULT(err);
     }
     else
     {
         cl_char cc = 0;
         err = clEnqueueFillBuffer(this->commandQueue, mem, &cc, sizeof(cc), 0, s / sizeof(cc), 0, NULL, NULL);
-        if (CL_SUCCESS != err)
-        {
-            LogError("Error: allocMem() clEnqueueFillBuffer return %s.\n", TranslateOpenCLError(err));
-        }
+        LOG_CL_RESULT(err);
         err = clFinish(this->commandQueue);
-        if (CL_SUCCESS != err)
-        {
-            LogError("Error: allocMem() clEnqueueFillBuffer/clFinish return %s.\n", TranslateOpenCLError(err));
-        }
+        LOG_CL_RESULT(err);
     }
 
 	return mem;
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index fd7e78e7..13eb232b 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -67,15 +67,13 @@ enum KernelName {
 	KERNEL_COUNT,
 };
 
+#define LOG_CL_RESULT(e)   if (CL_SUCCESS != (e)) { LogError("Error: %s:%d returned %s.\n", __FUNCTION__, __LINE__, TranslateOpenCLError((e)));}
+
 struct ocl_args_d_t
 {
 	ocl_args_d_t();
 	~ocl_args_d_t();
 
-	void* allocA(size_t s);
-	void* allocB(size_t s);
-	void* allocC(size_t s);
-
 	cl_mem allocMem(size_t s, const void *init = NULL);
 	ocl_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL);
     void releaseMemChannels(ocl_channels &rgb);
@@ -89,19 +87,5 @@ struct ocl_args_d_t
 	float            platformVersion;   // hold the OpenCL platform version (default 1.2)
 	float            deviceVersion;     // hold the OpenCL device version (default. 1.2)
 	float            compilerVersion;   // hold the device OpenCL C version (default. 1.2)
-
-										// Objects that are specific for algorithm implemented in this sample
-	cl_mem           srcA;              // hold first source buffer
-	cl_mem           srcB;              // hold second source buffer
-	cl_mem           dstMem;            // hold destination buffer
-
-	void*			 inputA;
-	size_t		     lenA;
-
-	void*			 inputB;
-	size_t			 lenB;
-
-	void*			 outputC;
-	size_t			 lenC;
 };
 

From 18f9672660fbcebd91f4a56ba02095129f140bd9 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 3 Jun 2017 14:39:44 +0800
Subject: [PATCH 128/189] =?UTF-8?q?=E8=B0=83=E6=95=B4cu=E7=BC=96=E8=AF=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 compile.bat | 159 +---------------------------------------------------
 1 file changed, 2 insertions(+), 157 deletions(-)

diff --git a/compile.bat b/compile.bat
index 8aa9430f..05a3a361 100644
--- a/compile.bat
+++ b/compile.bat
@@ -1,159 +1,4 @@
-@if "%1" == "" goto start
-@setlocal
-@set userinput=%1
-@if not "%1"=="store" @if not "%1"=="8.1" @if not "%userinput:~0,3%"=="10." goto usage
-@endlocal
-
-:start
-@call :GetVSCommonToolsDir
-@if "%VS140COMNTOOLS%"=="" goto error_no_VS140COMNTOOLSDIR
-
-@call "%VS140COMNTOOLS%VCVarsQueryRegistry.bat" No32bit 64bit %1 %2
-
-@if "%VSINSTALLDIR%"=="" goto error_no_VSINSTALLDIR
-@if "%VCINSTALLDIR%"=="" goto error_no_VCINSTALLDIR
-@if "%FrameworkDir64%"=="" goto error_no_FrameworkDIR64
-@if "%FrameworkVersion64%"=="" goto error_no_FrameworkVer64
-@if "%Framework40Version%"=="" goto error_no_Framework40Version
-
-@set FrameworkDir=%FrameworkDir64%
-@set FrameworkVersion=%FrameworkVersion64%
-
-@if not "%WindowsSDK_ExecutablePath_x64%" == "" @set PATH=%WindowsSDK_ExecutablePath_x64%;%PATH%
-
-@rem
-@rem Set Windows SDK include/lib path
-@rem
-@if not "%WindowsSdkDir%" == "" @set PATH=%WindowsSdkDir%bin\x64;%WindowsSdkDir%bin\x86;%PATH%
-@if not "%WindowsSdkDir%" == "" @set INCLUDE=%WindowsSdkDir%include\%WindowsSDKVersion%shared;%WindowsSdkDir%include\%WindowsSDKVersion%um;%WindowsSdkDir%include\%WindowsSDKVersion%winrt;%INCLUDE%
-@if not "%WindowsSdkDir%" == "" @set LIB=%WindowsSdkDir%lib\%WindowsSDKLibVersion%um\x64;%LIB%
-@if not "%WindowsSdkDir%" == "" @set LIBPATH=%WindowsLibPath%;%ExtensionSDKDir%\Microsoft.VCLibs\14.0\References\CommonConfiguration\neutral;%LIBPATH%
-
-@REM Set NETFXSDK include/lib path
-@if not "%NETFXSDKDir%" == "" @set INCLUDE=%NETFXSDKDir%include\um;%INCLUDE%
-@if not "%NETFXSDKDir%" == "" @set LIB=%NETFXSDKDir%lib\um\x64;%LIB%
-
-@rem
-@rem Set UniversalCRT include/lib path, the default is the latest installed version.
-@rem
-@if not "%UCRTVersion%" == "" @set INCLUDE=%UniversalCRTSdkDir%include\%UCRTVersion%\ucrt;%INCLUDE%
-@if not "%UCRTVersion%" == "" @set LIB=%UniversalCRTSdkDir%lib\%UCRTVersion%\ucrt\x64;%LIB%
-
-@rem PATH
-@rem ----
-@if exist "%VSINSTALLDIR%Team Tools\Performance Tools\x64" @set PATH=%VSINSTALLDIR%Team Tools\Performance Tools\x64;%VSINSTALLDIR%Team Tools\Performance Tools;%PATH%
-
-@if exist "%ProgramFiles%\HTML Help Workshop" set PATH=%ProgramFiles%\HTML Help Workshop;%PATH%
-@if exist "%ProgramFiles(x86)%\HTML Help Workshop" set PATH=%ProgramFiles(x86)%\HTML Help Workshop;%PATH%
-@if exist "%VSINSTALLDIR%Common7\Tools" set PATH=%VSINSTALLDIR%Common7\Tools;%PATH%
-@if exist "%VSINSTALLDIR%Common7\IDE" set PATH=%VSINSTALLDIR%Common7\IDE;%PATH%
-@if exist "%VCINSTALLDIR%VCPackages" set PATH=%VCINSTALLDIR%VCPackages;%PATH%
-@if exist "%FrameworkDir%\%Framework40Version%" set PATH=%FrameworkDir%\%Framework40Version%;%PATH%
-@if exist "%FrameworkDir%\%FrameworkVersion%" set PATH=%FrameworkDir%\%FrameworkVersion%;%PATH%
-@if exist "%VCINSTALLDIR%BIN\amd64" set PATH=%VCINSTALLDIR%BIN\amd64;%PATH%
-
-@rem Add path to MSBuild Binaries
-@if exist "%ProgramFiles%\MSBuild\14.0\bin\amd64" set PATH=%ProgramFiles%\MSBuild\14.0\bin\amd64;%PATH%
-@if exist "%ProgramFiles(x86)%\MSBuild\14.0\bin\amd64" set PATH=%ProgramFiles(x86)%\MSBuild\14.0\bin\amd64;%PATH%
-
-@if exist "%VSINSTALLDIR%Common7\IDE\CommonExtensions\Microsoft\TestWindow" @set PATH=%VSINSTALLDIR%Common7\IDE\CommonExtensions\Microsoft\TestWindow;%PATH%
-
-@rem INCLUDE
-@rem -------
-@if exist "%VCINSTALLDIR%ATLMFC\INCLUDE" set INCLUDE=%VCINSTALLDIR%ATLMFC\INCLUDE;%INCLUDE%
-@if exist "%VCINSTALLDIR%INCLUDE" set INCLUDE=%VCINSTALLDIR%INCLUDE;%INCLUDE%
-
-@rem LIB
-@rem ---
-@if "%1" == "store" goto setstorelib
-@if exist "%VCINSTALLDIR%ATLMFC\LIB\amd64" set LIB=%VCINSTALLDIR%ATLMFC\LIB\amd64;%LIB%
-@if exist "%VCINSTALLDIR%LIB\amd64" set LIB=%VCINSTALLDIR%LIB\amd64;%LIB%
-@goto setlibpath
-:setstorelib
-@if exist "%VCINSTALLDIR%LIB\store\amd64" set LIB=%VCINSTALLDIR%LIB\store\amd64;%LIB%
-
-:setlibpath
-@rem LIBPATH
-@rem -------
-@if "%1" == "store" goto setstorelibpath
-@if exist "%VCINSTALLDIR%ATLMFC\LIB\amd64" set LIBPATH=%VCINSTALLDIR%ATLMFC\LIB\amd64;%LIBPATH%
-@if exist "%VCINSTALLDIR%LIB\amd64" set LIBPATH=%VCINSTALLDIR%LIB\amd64;%LIBPATH%
-@goto appendlibpath
-:setstorelibpath
-@if exist "%VCINSTALLDIR%LIB\store\amd64" set LIBPATH=%VCINSTALLDIR%LIB\store\amd64;%VCINSTALLDIR%LIB\store\references;%LIBPATH%
-:appendlibpath
-@if exist "%FrameworkDir%\%Framework40Version%" set LIBPATH=%FrameworkDir%\%Framework40Version%;%LIBPATH%
-@if exist "%FrameworkDir%\%FrameworkVersion%" set LIBPATH=%FrameworkDir%\%FrameworkVersion%;%LIBPATH%
-
-@set Platform=X64
-@set CommandPromptType=Native
-
-@goto end
-
-@REM -----------------------------------------------------------------------
-:GetVSCommonToolsDir
-@set VS140COMNTOOLS=
-@call :GetVSCommonToolsDirHelper32 HKLM > nul 2>&1
-@if errorlevel 1 call :GetVSCommonToolsDirHelper32 HKCU > nul 2>&1
-@if errorlevel 1 call :GetVSCommonToolsDirHelper64  HKLM > nul 2>&1
-@if errorlevel 1 call :GetVSCommonToolsDirHelper64  HKCU > nul 2>&1
-@exit /B 0
-
-:GetVSCommonToolsDirHelper32
-@for /F "tokens=1,2*" %%i in ('reg query "%1\SOFTWARE\Microsoft\VisualStudio\SxS\VS7" /v "14.0"') DO (
-	@if "%%i"=="14.0" (
-		@SET VS140COMNTOOLS=%%k
-	)
-)
-@if "%VS140COMNTOOLS%"=="" exit /B 1
-@SET VS140COMNTOOLS=%VS140COMNTOOLS%Common7\Tools\
-@exit /B 0
-
-:GetVSCommonToolsDirHelper64
-@for /F "tokens=1,2*" %%i in ('reg query "%1\SOFTWARE\Wow6432Node\Microsoft\VisualStudio\SxS\VS7" /v "14.0"') DO (
-	@if "%%i"=="14.0" (
-		@SET VS140COMNTOOLS=%%k
-	)
-)
-@if "%VS140COMNTOOLS%"=="" exit /B 1
-@SET VS140COMNTOOLS=%VS140COMNTOOLS%Common7\Tools\
-@exit /B 0
-
-@REM -----------------------------------------------------------------------
-:error_no_VS140COMNTOOLSDIR
-@echo ERROR: Cannot determine the location of the VS Common Tools folder.
-@goto end
-
-:error_no_VSINSTALLDIR
-@echo ERROR: Cannot determine the location of the VS installation.
-@goto end
-
-:error_no_VCINSTALLDIR
-@echo ERROR: Cannot determine the location of the VC installation.
-@goto end
-
-:error_no_FrameworkDIR64
-@echo ERROR: Cannot determine the location of the .NET Framework 64bit installation.
-@goto end
-
-:error_no_FrameworkVer64
-@echo ERROR: Cannot determine the version of the .NET Framework 64bit installation.
-@goto end
-
-:error_no_Framework40Version
-@echo ERROR: Cannot determine the .NET Framework 4.0 version.
-@goto end
-
-:usage
-echo Error in script usage. The correct usage is:
-echo     %0
-echo   or
-echo     %0 store
-echo   or
-echo     %0 10.0.10240.0
-echo   or
-echo     %0 store 10.0.10240.0
-
-:end
+@rem set up Windows build environment variables
+call vcvars64.bat
 
 nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --fmad=false --machine 64 -G -g -ptx -o clguetzli\clguetzli.cu.ptx64 clguetzli\clguetzli.cu
\ No newline at end of file

From 601e367ab6c4411f7cd57eadc5158327a005380e Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sat, 3 Jun 2017 14:40:48 +0800
Subject: [PATCH 129/189] =?UTF-8?q?CUDA=E7=BC=96=E8=AF=91=E6=94=AF?=
 =?UTF-8?q?=E6=8C=81=E5=AE=8F=E5=BC=80=E5=85=B3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clbutter_comparator.cpp | 5 ++++-
 clguetzli/cuguetzli.cpp           | 3 +++
 clguetzli/cuguetzli.h             | 4 ++++
 clguetzli/ocu.cpp                 | 4 ++++
 clguetzli/ocu.h                   | 6 +++++-
 5 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index 53cd89fb..178e70e9 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -177,6 +177,7 @@ namespace butteraugli
                 );
             return;
         }
+#ifdef __USE_CUDA__
         else if (g_useCuda && xsize > 100 && ysize > 100)
         {
             mask->resize(3);
@@ -194,7 +195,7 @@ namespace butteraugli
             );
             return;
         }
-
+#endif
         _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc);
 
         if (g_checkOpenCL && xsize > 8 && ysize > 8)
@@ -305,6 +306,7 @@ namespace butteraugli
 
             clOpsinDynamicsImage(r, g, b, xsize, ysize);
         }
+#ifdef __USE_CUDA__
         else if (g_useCuda && xsize > 100 && ysize > 100)
         {
             float * r = rgb[0].data();
@@ -313,6 +315,7 @@ namespace butteraugli
 
             cuOpsinDynamicsImage(r, g, b, xsize, ysize);
         }
+#endif
         else
         {
             std::vector< std::vector<float>> orig_rgb;
diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
index a445d930..ec158691 100644
--- a/clguetzli/cuguetzli.cpp
+++ b/clguetzli/cuguetzli.cpp
@@ -2,6 +2,8 @@
 #include <algorithm>
 #include "ocu.h"
 
+#ifdef __USE_CUDA__
+
 void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize)
 {
     size_t channel_size = xsize * ysize * sizeof(float);
@@ -799,3 +801,4 @@ void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, con
     cuMemFree(blurred);
 }
 
+#endif
\ No newline at end of file
diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h
index 0783796a..e9dddde6 100644
--- a/clguetzli/cuguetzli.h
+++ b/clguetzli/cuguetzli.h
@@ -2,6 +2,8 @@
 #include "guetzli/processor.h"
 #include "clguetzli.cl.h"
 
+#ifdef __USE_CUDA__
+
 void cuOpsinDynamicsImage(
 	float *r, float *g, float *b, 
 	const size_t xsize, const size_t ysize);
@@ -113,3 +115,5 @@ void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr in, const size_t xsize,
 void cuAddBorderEx(CUdeviceptr out, const size_t xsize, const size_t ysize, const int step, const CUdeviceptr in);
 
 void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step);
+
+#endif
\ No newline at end of file
diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index c99d6ea9..6fbf58ee 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -2,6 +2,8 @@
 #include <nvrtc.h>
 #include "ocu.h"
 
+#ifdef __USE_CUDA__
+
 ocu_args_d_t& getOcu(void)
 {
     static bool bInit = false;
@@ -148,3 +150,5 @@ void ocu_args_d_t::releaseMemChannels(ocu_channels &rgb)
         rgb.ch[i] = NULL;
     }
 }
+
+#endif
\ No newline at end of file
diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h
index 63a4bb47..4c34edaf 100644
--- a/clguetzli/ocu.h
+++ b/clguetzli/ocu.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#ifdef __USE_CUDA__
+
 #include <cuda.h>
 #include "ocl.h"
 
@@ -21,4 +23,6 @@ struct ocu_args_d_t
     CUmodule    mod;
     CUcontext   ctxt;
     CUdevice    dev;
-};
\ No newline at end of file
+};
+
+#endif
\ No newline at end of file

From c0bab473191af8bc97762a9bc622c69ddad608d0 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sun, 4 Jun 2017 14:01:12 +0800
Subject: [PATCH 130/189] =?UTF-8?q?=E4=BC=98=E5=8C=96clSetKernelArg?=
 =?UTF-8?q?=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/cl.hpp        | 318 ++++++++++++++++++++++++++++++++++++++++
 clguetzli/clguetzli.cpp | 263 ++++++++-------------------------
 clguetzli/clguetzli.h   |   6 +-
 clguetzli/cuguetzli.cpp |  39 +++--
 4 files changed, 408 insertions(+), 218 deletions(-)
 create mode 100644 clguetzli/cl.hpp

diff --git a/clguetzli/cl.hpp b/clguetzli/cl.hpp
new file mode 100644
index 00000000..8be6313e
--- /dev/null
+++ b/clguetzli/cl.hpp
@@ -0,0 +1,318 @@
+#pragma once
+
+template<typename T>   // generic case: the pointee is handed to OpenCL unchanged
+inline void clSetKernelArgK(cl_kernel k, int idx, T* t)   // k: kernel, idx: argument index, t: address of the value
+{
+    clSetKernelArg(k, idx, sizeof(T), t);   // NOTE(review): the result of clSetKernelArg is discarded, so a failed bind goes unnoticed here
+}
+
+template<>   // specialization for int pointees: the value is sent to the kernel as a cl_int
+inline void clSetKernelArgK(cl_kernel k, int idx, int* t)   // k: kernel, idx: argument index, t: host int to pass
+{
+    const cl_int value = *t;   // copy the host int into a cl_int before handing it over
+    clSetKernelArg(k, idx, sizeof(value), &value);   // size and address both refer to the cl_int copy
+}
+
+template<>   // specialization for const int pointees: the value is sent to the kernel as a cl_int
+inline void clSetKernelArgK(cl_kernel k, int idx, const int* t)   // k: kernel, idx: argument index, t: host int to pass
+{
+    const cl_int value = *t;   // copy the host int into a cl_int before handing it over
+    clSetKernelArg(k, idx, sizeof(value), &value);   // size and address both refer to the cl_int copy
+}
+
+template<>   // specialization for size_t pointees: the value is sent to the kernel as a cl_int
+inline void clSetKernelArgK(cl_kernel k, int idx, size_t* t)   // k: kernel, idx: argument index, t: host size to pass
+{
+    cl_int c = static_cast<cl_int>(*t);   // explicit narrowing: size_t can be wider than cl_int, so values outside the cl_int range are not preserved
+    clSetKernelArg(k, idx, sizeof(cl_int), &c);   // passes the converted copy, not *t; the call's result is not checked
+}
+
+template<>   // specialization for const size_t pointees: the value is sent to the kernel as a cl_int
+inline void clSetKernelArgK(cl_kernel k, int idx, const size_t* t)   // k: kernel, idx: argument index, t: host size to pass
+{
+    cl_int c = static_cast<cl_int>(*t);   // explicit narrowing: size_t can be wider than cl_int, so values outside the cl_int range are not preserved
+    clSetKernelArg(k, idx, sizeof(cl_int), &c);   // passes the converted copy, not *t; the call's result is not checked
+}
+
+template<typename T0>
+inline void clSetKernelArgEx(cl_kernel k, T0* t0)
+{   // Binds t0 to kernel argument 0 of k; this is where the longer overloads bottom out.
+    clSetKernelArgK(k, 0, t0);   // clSetKernelArgK picks the per-type handling of the pointee
+}
+
+template<typename T0, typename T1>
+inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1)
+{   // Binds t0..t1 to kernel arguments 0..1 of k.
+    clSetKernelArgK(k, 1, t1);   // last pointer goes to index 1
+    clSetKernelArgEx(k, t0);   // remaining pointer: 1-pointer overload (index 0)
+}
+
+template<typename T0, typename T1, typename T2>
+inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2)
+{   // Binds t0..t2 to kernel arguments 0..2 of k.
+    clSetKernelArgK(k, 2, t2);   // last pointer goes to index 2
+    clSetKernelArgEx(k, t0, t1);   // remaining pointers: 2-pointer overload (indices 0..1)
+}
+
+template<typename T0, typename T1, typename T2, typename T3>
+inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3)
+{   // Binds t0..t3 to kernel arguments 0..3 of k.
+    clSetKernelArgK(k, 3, t3);   // last pointer goes to index 3
+    clSetKernelArgEx(k, t0, t1, t2);   // remaining pointers: 3-pointer overload (indices 0..2)
+}
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4>
+inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4)
+{   // Binds t0..t4 to kernel arguments 0..4 of k.
+    clSetKernelArgK(k, 4, t4);   // last pointer goes to index 4
+    clSetKernelArgEx(k, t0, t1, t2, t3);   // remaining pointers: 4-pointer overload (indices 0..3)
+}
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
+inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5)
+{   // Binds t0..t5 to kernel arguments 0..5 of k.
+    clSetKernelArgK(k, 5, t5);   // last pointer goes to index 5
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4);   // remaining pointers: 5-pointer overload (indices 0..4)
+}
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
+inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6)
+{   // Binds t0..t6 to kernel arguments 0..6 of k.
+    clSetKernelArgK(k, 6, t6);   // last pointer goes to index 6
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5);   // remaining pointers: 6-pointer overload (indices 0..5)
+}
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
+inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7)
+{   // Binds t0..t7 to kernel arguments 0..7 of k.
+    clSetKernelArgK(k, 7, t7);   // last pointer goes to index 7
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6);   // remaining pointers: 7-pointer overload (indices 0..6)
+}
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4, 
+         typename T5, typename T6, typename T7, typename T8>
+inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8)
+{   // Binds t0..t8 to kernel arguments 0..8 of k.
+    clSetKernelArgK(k, 8, t8);   // last pointer goes to index 8
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7);   // remaining pointers: 8-pointer overload (indices 0..7)
+}
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4,
+         typename T5, typename T6, typename T7, typename T8, typename T9>
+inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9)
+{   // Binds t0..t9 to kernel arguments 0..9 of k.
+    clSetKernelArgK(k, 9, t9);   // last pointer goes to index 9
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8);   // remaining pointers: 9-pointer overload (indices 0..8)
+}
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8, typename T9,
+    typename T10>
+    inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, T10* t10)
+{   // Binds t0..t10 to kernel arguments 0..10 of k.
+    clSetKernelArgK(k, 10, t10);   // last pointer goes to index 10
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9);   // remaining pointers: 10-pointer overload (indices 0..9)
+}
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8, typename T9,
+    typename T10, typename T11>
+    inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, T10* t10, T11* t11)
+{   // Binds t0..t11 to kernel arguments 0..11 of k.
+    clSetKernelArgK(k, 11, t11);   // last pointer goes to index 11
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10);   // remaining pointers: 11-pointer overload (indices 0..10)
+}
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8, typename T9,
+    typename T10, typename T11, typename T12>
+    inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, 
+          T5* t5, T6* t6, T7* t7, T8* t8, T9* t9, 
+          T10* t10, T11* t11, T12* t12)
+{   // Binds t0..t12 to kernel arguments 0..12 of k.
+    clSetKernelArgK(k, 12, t12);   // last pointer goes to index 12
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11);   // remaining pointers: 12-pointer overload (indices 0..11)
+}
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8, typename T9,
+    typename T10, typename T11, typename T12, typename T13>
+    inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4,
+        T5* t5, T6* t6, T7* t7, T8* t8, T9* t9,
+        T10* t10, T11* t11, T12* t12, T13* t13)
+{   // Binds t0..t13 to kernel arguments 0..13 of k.
+    clSetKernelArgK(k, 13, t13);   // last pointer goes to index 13
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12);   // remaining pointers: 13-pointer overload (indices 0..12)
+}
+
+template<typename T0, typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8, typename T9,
+    typename T10, typename T11, typename T12, typename T13, typename T14>
+    inline void clSetKernelArgEx(cl_kernel k, T0* t0, T1* t1, T2* t2, T3* t3, T4* t4,
+        T5* t5, T6* t6, T7* t7, T8* t8, T9* t9,
+        T10* t10, T11* t11, T12* t12, T13* t13,
+        T14* t14)
+{   // Binds t0..t14 to kernel arguments 0..14 of k.
+    clSetKernelArgK(k, 14, t14);   // last pointer goes to index 14
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13);   // remaining pointers: 14-pointer overload (indices 0..13)
+}
+
+template<
+    typename T0, typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8, typename T9,
+    typename T10, typename T11, typename T12, typename T13, typename T14,
+    typename T15>
+    inline void clSetKernelArgEx(cl_kernel k,
+        T0* t0, T1* t1, T2* t2, T3* t3, T4* t4,
+        T5* t5, T6* t6, T7* t7, T8* t8, T9* t9,
+        T10* t10, T11* t11, T12* t12, T13* t13, T14* t14,
+        T15* t15)
+{   // Binds t0..t15 to kernel arguments 0..15 of k.
+    clSetKernelArgK(k, 15, t15);   // last pointer goes to index 15
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14);   // remaining pointers: 15-pointer overload (indices 0..14)
+}
+
+template<
+    typename T0, typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8, typename T9,
+    typename T10, typename T11, typename T12, typename T13, typename T14,
+    typename T15, typename T16>
+    inline void clSetKernelArgEx(cl_kernel k,
+        T0* t0, T1* t1, T2* t2, T3* t3, T4* t4,
+        T5* t5, T6* t6, T7* t7, T8* t8, T9* t9,
+        T10* t10, T11* t11, T12* t12, T13* t13, T14* t14,
+        T15* t15, T16* t16)
+{   // Binds t0..t16 to kernel arguments 0..16 of k.
+    clSetKernelArgK(k, 16, t16);   // last pointer goes to index 16
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15);   // remaining pointers: 16-pointer overload (indices 0..15)
+}
+
+template<
+    typename T0, typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8, typename T9,
+    typename T10, typename T11, typename T12, typename T13, typename T14,
+    typename T15, typename T16, typename T17>
+    inline void clSetKernelArgEx(cl_kernel k,
+        T0* t0, T1* t1, T2* t2, T3* t3, T4* t4,
+        T5* t5, T6* t6, T7* t7, T8* t8, T9* t9,
+        T10* t10, T11* t11, T12* t12, T13* t13, T14* t14,
+        T15* t15, T16* t16, T17* t17)
+{   // Binds t0..t17 to kernel arguments 0..17 of k.
+    clSetKernelArgK(k, 17, t17);   // last pointer goes to index 17
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16);   // remaining pointers: 17-pointer overload (indices 0..16)
+}
+
+template<
+    typename T0, typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8, typename T9,
+    typename T10, typename T11, typename T12, typename T13, typename T14,
+    typename T15, typename T16, typename T17, typename T18>
+    inline void clSetKernelArgEx(cl_kernel k,
+        T0* t0, T1* t1, T2* t2, T3* t3, T4* t4,
+        T5* t5, T6* t6, T7* t7, T8* t8, T9* t9,
+        T10* t10, T11* t11, T12* t12, T13* t13, T14* t14,
+        T15* t15, T16* t16, T17* t17, T18* t18)
+{   // Binds t0..t18 to kernel arguments 0..18 of k.
+    clSetKernelArgK(k, 18, t18);   // last pointer goes to index 18
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17);   // remaining pointers: 18-pointer overload (indices 0..17)
+}
+
+template<
+    typename T0, typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8, typename T9,
+    typename T10, typename T11, typename T12, typename T13, typename T14,
+    typename T15, typename T16, typename T17, typename T18, typename T19>
+    inline void clSetKernelArgEx(cl_kernel k,
+        T0* t0, T1* t1, T2* t2, T3* t3, T4* t4,
+        T5* t5, T6* t6, T7* t7, T8* t8, T9* t9,
+        T10* t10, T11* t11, T12* t12, T13* t13, T14* t14,
+        T15* t15, T16* t16, T17* t17, T18* t18, T19* t19)
+{   // Binds t0..t19 to kernel arguments 0..19 of k.
+    clSetKernelArgK(k, 19, t19);   // last pointer goes to index 19
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18);   // remaining pointers: 19-pointer overload (indices 0..18)
+}
+
+template<
+    typename T0, typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8, typename T9,
+    typename T10, typename T11, typename T12, typename T13, typename T14,
+    typename T15, typename T16, typename T17, typename T18, typename T19,
+    typename T20>
+    inline void clSetKernelArgEx(cl_kernel k,
+        T0* t0, T1* t1, T2* t2, T3* t3, T4* t4,
+        T5* t5, T6* t6, T7* t7, T8* t8, T9* t9,
+        T10* t10, T11* t11, T12* t12, T13* t13, T14* t14,
+        T15* t15, T16* t16, T17* t17, T18* t18, T19* t19,
+        T20* t20)
+{   // Binds t0..t20 to kernel arguments 0..20 of k.
+    clSetKernelArgK(k, 20, t20);   // last pointer goes to index 20
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19);   // remaining pointers: 20-pointer overload (indices 0..19)
+}
+
+template<
+    typename T0, typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8, typename T9,
+    typename T10, typename T11, typename T12, typename T13, typename T14,
+    typename T15, typename T16, typename T17, typename T18, typename T19,
+    typename T20, typename T21>
+    inline void clSetKernelArgEx(cl_kernel k,
+        T0* t0, T1* t1, T2* t2, T3* t3, T4* t4,
+        T5* t5, T6* t6, T7* t7, T8* t8, T9* t9,
+        T10* t10, T11* t11, T12* t12, T13* t13, T14* t14,
+        T15* t15, T16* t16, T17* t17, T18* t18, T19* t19,
+        T20* t20, T21* t21)
+{   // Binds t0..t21 to kernel arguments 0..21 of k.
+    clSetKernelArgK(k, 21, t21);   // last pointer goes to index 21
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20);   // remaining pointers: 21-pointer overload (indices 0..20)
+}
+
+template<
+    typename T0, typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8, typename T9,
+    typename T10, typename T11, typename T12, typename T13, typename T14,
+    typename T15, typename T16, typename T17, typename T18, typename T19,
+    typename T20, typename T21, typename T22>
+    inline void clSetKernelArgEx(cl_kernel k,
+        T0* t0, T1* t1, T2* t2, T3* t3, T4* t4,
+        T5* t5, T6* t6, T7* t7, T8* t8, T9* t9,
+        T10* t10, T11* t11, T12* t12, T13* t13, T14* t14,
+        T15* t15, T16* t16, T17* t17, T18* t18, T19* t19,
+        T20* t20, T21* t21, T22* t22)
+{   // Binds t0..t22 to kernel arguments 0..22 of k.
+    clSetKernelArgK(k, 22, t22);   // last pointer goes to index 22
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21);   // remaining pointers: 22-pointer overload (indices 0..21)
+}
+
+template<
+    typename T0, typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8, typename T9,
+    typename T10, typename T11, typename T12, typename T13, typename T14,
+    typename T15, typename T16, typename T17, typename T18, typename T19,
+    typename T20, typename T21, typename T22, typename T23>
+    inline void clSetKernelArgEx(cl_kernel k,
+        T0* t0, T1* t1, T2* t2, T3* t3, T4* t4,
+        T5* t5, T6* t6, T7* t7, T8* t8, T9* t9,
+        T10* t10, T11* t11, T12* t12, T13* t13, T14* t14,
+        T15* t15, T16* t16, T17* t17, T18* t18, T19* t19,
+        T20* t20, T21* t21, T22* t22, T23* t23)
+{   // Binds t0..t23 to kernel arguments 0..23 of k.
+    clSetKernelArgK(k, 23, t23);   // last pointer goes to index 23
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22);   // remaining pointers: 23-pointer overload (indices 0..22)
+}
+
+template<
+    typename T0, typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8, typename T9,
+    typename T10, typename T11, typename T12, typename T13, typename T14,
+    typename T15, typename T16, typename T17, typename T18, typename T19,
+    typename T20, typename T21, typename T22, typename T23, typename T24>
+inline void clSetKernelArgEx(cl_kernel k, 
+        T0* t0, T1* t1, T2* t2, T3* t3, T4* t4,
+        T5* t5, T6* t6, T7* t7, T8* t8, T9* t9,
+        T10* t10, T11* t11, T12* t12, T13* t13, T14* t14, 
+        T15* t15, T16* t16, T17* t17, T18* t18, T19* t19,
+        T20* t20, T21* t21, T22* t22, T23* t23, T24* t24)
+{   // Binds t0..t24 to kernel arguments 0..24 of k.
+    clSetKernelArgK(k, 24, t24);   // last pointer goes to index 24
+    clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22, t23);   // remaining pointers: 24-pointer overload (indices 0..23)
+}
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index c6c5eb4f..53be7348 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -2,6 +2,7 @@
 #include <math.h>
 #include <algorithm>
 #include <vector>
+#include "cl.hpp"
 
 extern bool g_useOpenCL = false;
 extern bool g_useCuda = false;
@@ -172,33 +173,18 @@ void clComputeBlockZeroingOrder(
 
     int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height;
     cl_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch);
-    cl_float clBlockErrorLimit = BlockErrorLimit;
-    cl_int clWidth = image_width;
-    cl_int clHeight = image_height;
-    cl_int clFactor = factor;
-    cl_int clMask = comp_mask;
 
     cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mem_orig_coeff[0]);
-    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mem_orig_coeff[1]);
-    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mem_orig_coeff[2]);
-    clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mem_orig_image);
-    clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mem_mask_scale);
-    clSetKernelArg(kernel, 5, sizeof(cl_int), &clWidth);
-    clSetKernelArg(kernel, 6, sizeof(cl_int), &clHeight);
-    clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&mem_mayout_coeff[0]);
-    clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&mem_mayout_coeff[1]);
-    clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&mem_mayout_coeff[2]);
-    clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&mem_mayout_pixel[0]);
-    clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&mem_mayout_pixel[1]);
-    clSetKernelArg(kernel, 12, sizeof(cl_mem), (void*)&mem_mayout_pixel[2]);
-    clSetKernelArg(kernel, 13, sizeof(channel_info), &mayout_channel[0]);
-    clSetKernelArg(kernel, 14, sizeof(channel_info), &mayout_channel[1]);
-    clSetKernelArg(kernel, 15, sizeof(channel_info), &mayout_channel[2]);
-    clSetKernelArg(kernel, 16, sizeof(cl_int), &clFactor);
-    clSetKernelArg(kernel, 17, sizeof(cl_int), &clMask);
-    clSetKernelArg(kernel, 18, sizeof(cl_float), &clBlockErrorLimit);
-    clSetKernelArg(kernel, 19, sizeof(cl_mem), &mem_output_order_batch);
+    clSetKernelArgEx(kernel, &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2],
+                        &mem_orig_image, &mem_mask_scale, 
+                        &image_width, &image_height,
+                        &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2],
+                        &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2],
+                        &mayout_channel[0], &mayout_channel[1], &mayout_channel[2],
+                        &factor, 
+						&comp_mask, 
+						&BlockErrorLimit, 
+						&mem_output_order_batch);
 
     size_t globalWorkSize[2] = { blockf_width, blockf_height };
     err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -224,7 +210,7 @@ void clComputeBlockZeroingOrder(
 void clMask(
     float* mask_r,  float* mask_g,    float* mask_b,
     float* maskdc_r, float* maskdc_g, float* maskdc_b,
-    size_t xsize, size_t ysize,
+    const size_t xsize, const size_t ysize,
     const float* r,  const float* g,  const float* b,
     const float* r2, const float* g2, const float* b2)
 {
@@ -257,27 +243,14 @@ void clConvolutionEx(
     cl_mem result/*out*/,
     const cl_mem inp, size_t xsize, size_t ysize,
     const cl_mem multipliers, size_t len,
-    int xstep, int offset, double border_ratio)
+    int xstep, int offset, float border_ratio)
 {
 	ocl_args_d_t &ocl = getOcl();
 
 	size_t oxsize = (xsize + xstep - 1) / xstep;
 
-	cl_int clxsize = xsize;
-	cl_int clxstep = xstep;
-	cl_int cllen = len;
-	cl_int cloffset = offset;
-	cl_float clborder_ratio = border_ratio;
-
 	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result);
-    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp);
-    clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxsize);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&multipliers);
-    clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&cllen);
-	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&clxstep);
-	clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&cloffset);
-	clSetKernelArg(kernel, 7, sizeof(cl_float), (void*)&clborder_ratio);
+    clSetKernelArgEx(kernel, &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio);
 
 	size_t globalWorkSize[2] = { oxsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -290,23 +263,12 @@ void clConvolutionXEx(
     cl_mem result/*out*/,
     const cl_mem inp, size_t xsize, size_t ysize,
 	const cl_mem multipliers, size_t len,
-	int xstep, int offset, double border_ratio)
+	int xstep, int offset, float border_ratio)
 {
 	ocl_args_d_t &ocl = getOcl();
 
-	cl_int clxstep = xstep;
-	cl_int cllen = len;
-	cl_int cloffset = offset;
-	cl_float clborder_ratio = border_ratio;
-
 	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONX];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result);
-    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&multipliers);
-    clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cllen);
-	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&xstep);
-	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cloffset);
-	clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio);
+    clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -319,23 +281,12 @@ void clConvolutionYEx(
     cl_mem result/*out*/,
     const cl_mem inp, size_t xsize, size_t ysize,
 	const cl_mem multipliers, size_t len,
-	int xstep, int offset, double border_ratio)
+	int xstep, int offset, float border_ratio)
 {
 	ocl_args_d_t &ocl = getOcl();
 
-	cl_int clxstep = xstep;
-	cl_int cllen = len;
-	cl_int cloffset = offset;
-	cl_float clborder_ratio = border_ratio;
-
 	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONY];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result);
-    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&inp);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&multipliers);
-    clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cllen);
-	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&xstep);
-	clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&cloffset);
-	clSetKernelArg(kernel, 6, sizeof(cl_float), (void*)&clborder_ratio);
+    clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -351,13 +302,8 @@ void clSquareSampleEx(
 {
 	ocl_args_d_t &ocl = getOcl();
 
-	cl_int clxstep = xstep;
-	cl_int clystep = ystep;
 	cl_kernel kernel = ocl.kernel[KERNEL_SQUARESAMPLE];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&image);
-	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clxstep);
-	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&clystep);
+    clSetKernelArgEx(kernel, &result, &image, &xstep, &ystep);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -418,12 +364,7 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t
 	clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
 
 	cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&rgb.r);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&rgb.g);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&rgb.b);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&rgb_blurred.r);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&rgb_blurred.g);
-	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&rgb_blurred.b);
+    clSetKernelArgEx(kernel, &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b);
 
 	size_t globalWorkSize[1] = { xsize * ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -455,18 +396,10 @@ void clMaskHighIntensityChangeEx(
 	clFinish(ocl.commandQueue);
 
 	cl_kernel kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&xyb0.r);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&xyb0.g);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&xyb0.b);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb1.r);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb1.g);
-	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb1.b);
-	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&c0.r);
-	clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&c0.g);
-	clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&c0.b);
-	clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&c1.r);
-	clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&c1.g);
-	clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&c1.b);
+    clSetKernelArgEx(kernel, &xyb0.r, &xyb0.g, &xyb0.b,
+                            &xyb1.r, &xyb1.g, &xyb1.b,
+                            &c0.r, &c0.g, &c0.b,
+                            &c1.r, &c1.g, &c1.b);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -497,21 +430,11 @@ void clEdgeDetectorMapEx(
 		clBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]);
 	}
 
-	cl_int clxsize = xsize;
-	cl_int clysize = ysize;
-	cl_int clstep = step;
-
 	cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTOR];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), &result);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), &rgb_blured.r);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb_blured.g);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb_blured.b);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb2_blured.r);
-	clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2_blured.g);
-	clSetKernelArg(kernel, 6, sizeof(cl_mem), &rgb2_blured.b);
-	clSetKernelArg(kernel, 7, sizeof(cl_int), &clxsize);
-	clSetKernelArg(kernel, 8, sizeof(cl_int), &clysize);
-	clSetKernelArg(kernel, 9, sizeof(cl_int), &clstep);
+    clSetKernelArgEx(kernel, &result,
+        &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
+        &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
+        &xsize, &ysize, &step);
 
 	const size_t res_xsize = (xsize + step - 1) / step;
 	const size_t res_ysize = (ysize + step - 1) / step;
@@ -534,22 +457,11 @@ void clBlockDiffMapEx(
 {
 	ocl_args_d_t &ocl = getOcl();
 
-	cl_int clxsize = xsize;
-	cl_int clysize = ysize;
-	cl_int clstep = step;
-
 	cl_kernel kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), &block_diff_dc);
-    clSetKernelArg(kernel, 1, sizeof(cl_mem), &block_diff_ac);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb.r);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb.g);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb.b);
-	clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2.r);
-	clSetKernelArg(kernel, 6, sizeof(cl_mem), &rgb2.g);
-	clSetKernelArg(kernel, 7, sizeof(cl_mem), &rgb2.b);
-	clSetKernelArg(kernel, 8, sizeof(cl_int), &clxsize);
-	clSetKernelArg(kernel, 9, sizeof(cl_int), &clysize);
-	clSetKernelArg(kernel, 10, sizeof(cl_int), &clstep);
+    clSetKernelArgEx(kernel, &block_diff_dc, &block_diff_ac,
+        &rgb.r, &rgb.g, &rgb.b,
+        &rgb2.r, &rgb2.g, &rgb2.b,
+        &xsize, &ysize, &step);
 
 	const size_t res_xsize = (xsize + step - 1) / step;
 	const size_t res_ysize = (ysize + step - 1) / step;
@@ -579,21 +491,11 @@ void clEdgeDetectorLowFreqEx(
 		clBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]);
 	}
 
-	cl_int clxsize = xsize;
-	cl_int clysize = ysize;
-	cl_int clstep = step;
-
 	cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), &block_diff_ac);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), &rgb_blured.r);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), &rgb_blured.g);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), &rgb_blured.b);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), &rgb2_blured.r);
-	clSetKernelArg(kernel, 5, sizeof(cl_mem), &rgb2_blured.g);
-	clSetKernelArg(kernel, 6, sizeof(cl_mem), &rgb2_blured.b);
-	clSetKernelArg(kernel, 7, sizeof(cl_int), &clxsize);
-	clSetKernelArg(kernel, 8, sizeof(cl_int), &clysize);
-	clSetKernelArg(kernel, 9, sizeof(cl_int), &clstep);
+    clSetKernelArgEx(kernel, &block_diff_ac,
+        &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
+        &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
+        &xsize, &ysize, &step);
 
 	const size_t res_xsize = (xsize + step - 1) / step;
 	const size_t res_ysize = (ysize + step - 1) / step;
@@ -616,15 +518,9 @@ void clDiffPrecomputeEx(
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.x);
-    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.y);
-    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.b);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&xyb0.x);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&xyb0.y);
-	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&xyb0.b);
-	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&xyb1.x);
-	clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&xyb1.y);
-	clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&xyb1.b);
+    clSetKernelArgEx(kernel, &mask.x, &mask.y, &mask.b, 
+                            &xyb0.x, &xyb0.y, &xyb0.b,
+        &xyb1.x, &xyb1.y, &xyb1.b);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -637,11 +533,8 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w)
 {
 	ocl_args_d_t &ocl = getOcl();
 
-	cl_double clscale = w;
-
 	cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img);
-	clSetKernelArg(kernel, 1, sizeof(cl_double), (void*)&clscale);
+    clSetKernelArgEx(kernel, &img, &w);
 
 	size_t globalWorkSize[1] = { size };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -665,8 +558,7 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize
     clEnqueueCopyBuffer(ocl.commandQueue, img, img_org, 0, 0, len, 0, NULL, NULL);
 
     cl_kernel kernel = ocl.kernel[KERNEL_AVERAGE5X5];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&img);
-    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img_org);
+    clSetKernelArgEx(kernel, &img, &img_org);
 
     size_t globalWorkSize[2] = { xsize, ysize };
     cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -684,15 +576,10 @@ void clMinSquareValEx(
 {
 	ocl_args_d_t &ocl = getOcl();
 
-	cl_int cloffset = offset;
-	cl_int clsquare_size = square_size;
 	cl_mem result = ocl.allocMem(sizeof(cl_float) * xsize * ysize);
 
 	cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&img);
-	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&clsquare_size);
-	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&cloffset);
+    clSetKernelArgEx(kernel, &result, &img, &square_size, &offset);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -805,18 +692,10 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
     ocl_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb);
 
 	cl_kernel kernel = ocl.kernel[KERNEL_DOMASK];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&mask.r);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.g);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.b);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask_dc.r);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mask_dc.g);
-	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask_dc.b);
-	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&xyb.x);
-	clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&xyb.y);
-	clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*)&xyb.b);
-	clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&xyb_dc.x);
-	clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&xyb_dc.y);
-	clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&xyb_dc.b);
+    clSetKernelArgEx(kernel, &mask.r, &mask.g, &mask.b,
+        &mask_dc.r, &mask_dc.g, &mask_dc.b,
+        &xyb.x, &xyb.y, &xyb.b,
+        &xyb_dc.x, &xyb_dc.y, &xyb_dc.b);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -873,26 +752,15 @@ void clCombineChannelsEx(
 	const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step;
 	const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step;
 
-	cl_int clres_size = res_xsize;
-	cl_int clxsize = xsize;
-	cl_int clysize = ysize;
-	cl_int clstep = step;
-
 	cl_kernel kernel = ocl.kernel[KERNEL_COMBINECHANNELS];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&result);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&mask.r);
-	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&mask.g);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&mask.b);
-	clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&mask_dc.r);
-	clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&mask_dc.g);
-	clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&mask_dc.b);
-    clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&clxsize);
-    clSetKernelArg(kernel, 8, sizeof(cl_int), (void*)&clysize);
-	clSetKernelArg(kernel, 9, sizeof(cl_mem), (void*)&block_diff_dc);
-	clSetKernelArg(kernel, 10, sizeof(cl_mem), (void*)&block_diff_ac);
-	clSetKernelArg(kernel, 11, sizeof(cl_mem), (void*)&edge_detector_map);
-	clSetKernelArg(kernel, 12, sizeof(cl_int), (void*)&clres_size);
-	clSetKernelArg(kernel, 13, sizeof(cl_int), (void*)&clstep);
+    clSetKernelArgEx(kernel, &result, 
+                            &mask.r, &mask.g, &mask.b,
+        &mask_dc.r, &mask_dc.g, &mask_dc.b, 
+        &xsize, &ysize,
+        &block_diff_dc, &block_diff_ac,
+        &edge_detector_map,
+        &res_xsize,
+        &step);
 
 	size_t globalWorkSize[2] = { work_xsize, work_ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -905,18 +773,10 @@ void clUpsampleSquareRootEx(cl_mem diffmap, const size_t xsize, const size_t ysi
 {
 	ocl_args_d_t &ocl = getOcl();
 
-	cl_int clxsize = xsize;
-	cl_int clysize = ysize;
-	cl_int clstep = step;
-
     cl_mem diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float));
 
 	cl_kernel kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&diffmap_out);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&diffmap);
-	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&xsize);
-	clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&ysize);
-	clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&step);
+    clSetKernelArgEx(kernel, &diffmap_out, &diffmap, &xsize, &ysize, &step);
 
 	const size_t res_xsize = (xsize + step - 1) / step;
 	const size_t res_ysize = (ysize + step - 1) / step;
@@ -938,13 +798,9 @@ void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const siz
 
 	cl_int cls = 8 - step;
 	cl_int cls2 = (8 - step) / 2;
-    cl_int clxsize = xsize;
+
 	cl_kernel kernel = ocl.kernel[KERNEL_REMOVEBORDER];
-    clSetKernelArg(kernel, 0, sizeof(cl_mem), &out);
-	clSetKernelArg(kernel, 1, sizeof(cl_mem), &in);
-    clSetKernelArg(kernel, 2, sizeof(cl_int), &clxsize);
-	clSetKernelArg(kernel, 3, sizeof(cl_int), &cls);
-	clSetKernelArg(kernel, 4, sizeof(cl_int), &cls2);
+    clSetKernelArgEx(kernel, &out, &in, &xsize, &cls, &cls2);
 
 	size_t globalWorkSize[2] = { xsize - cls, ysize - cls};
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -960,10 +816,7 @@ void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in)
     cl_int cls = 8 - step;
     cl_int cls2 = (8 - step) / 2;
 	cl_kernel kernel = ocl.kernel[KERNEL_ADDBORDER];
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&out);
-	clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&cls);
-	clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&cls2);
-	clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&in);
+    clSetKernelArgEx(kernel, &out, &cls, &cls2, &in);
 
 	size_t globalWorkSize[2] = { xsize, ysize};
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 31b10e36..ccdf24a8 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -45,19 +45,19 @@ void clConvolutionEx(
     cl_mem result/*out*/,
     const cl_mem inp, size_t xsize, size_t ysize,
     const cl_mem multipliers, size_t len,
-    int xstep, int offset, double border_ratio);
+    int xstep, int offset, float border_ratio);
 
 void clConvolutionXEx(
     cl_mem result/*out*/, 
     const cl_mem inp, size_t xsize, size_t ysize,
     const cl_mem multipliers, size_t len,
-    int xstep, int offset, double border_ratio);
+    int xstep, int offset, float border_ratio);
 
 void clConvolutionYEx(
     cl_mem result/*out*/,
     const cl_mem inp, size_t xsize, size_t ysize,
     const cl_mem multipliers, size_t len,
-    int xstep, int offset, double border_ratio);
+    int xstep, int offset, float border_ratio);
 
 void clSquareSampleEx(
     cl_mem result/*out*/,
diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
index ec158691..ced35da7 100644
--- a/clguetzli/cuguetzli.cpp
+++ b/clguetzli/cuguetzli.cpp
@@ -115,7 +115,7 @@ void cuComputeBlockZeroingOrder(
     CUdeviceptr mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch);
 
     const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2],
-        &mem_orig_image, &mem_orig_image, &mem_mask_scale,
+        &mem_orig_image, &mem_mask_scale,
         &image_width, &image_height,
         &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2],
         &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2],
@@ -179,11 +179,33 @@ void cuMask(
     ocl.releaseMemChannels(mask_dc);
 }
 
+void cuConvolutionEx(
+    CUdeviceptr result/*out*/,
+    const CUdeviceptr inp, size_t xsize, size_t ysize,
+    const CUdeviceptr multipliers, size_t len,
+    int xstep, int offset, float border_ratio)
+{
+    ocu_args_d_t &ocl = getOcu();
+
+    size_t oxsize = (xsize + xstep - 1) / xstep;
+
+    const void *args[] = { &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio };
+
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTION],
+        oxsize, ysize, 1,
+        1, 1, 1,
+        0,
+        ocl.stream, (void**)args, NULL);
+
+    err = cuStreamSynchronize(ocl.stream);
+}
+
+
 void cuConvolutionXEx(
     CUdeviceptr result/*out*/,
     const CUdeviceptr inp, size_t xsize, size_t ysize,
     const CUdeviceptr multipliers, size_t len,
-    int xstep, int offset, double border_ratio)
+    int xstep, int offset, float border_ratio)
 {
     ocu_args_d_t &ocl = getOcu();
 
@@ -204,12 +226,11 @@ void cuConvolutionYEx(
     const CUdeviceptr multipliers, size_t len,
     int xstep, int offset, double border_ratio)
 {
-    CUresult err = CUDA_SUCCESS;
     ocu_args_d_t &ocl = getOcu();
 
     const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
 
-    err = cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTIONY],
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTIONY],
         xsize, ysize, 1,
         1, 1, 1,
         0,
@@ -223,12 +244,11 @@ void cuSquareSampleEx(
     const CUdeviceptr image, size_t xsize, size_t ysize,
     size_t xstep, size_t ystep)
 {
-    CUresult err = CUDA_SUCCESS;
     ocu_args_d_t &ocu = getOcu();
 
     const void *args[] = { &result, &image, &xstep, &ystep };
 
-    err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE],
+    CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE],
         xsize, ysize, 1,
         1, 1, 1,
         0,
@@ -253,7 +273,6 @@ void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ys
 
     const int xstep = std::max<int>(1, int(sigma / 3));
 
-    CUresult err = CUDA_SUCCESS;
     ocu_args_d_t &ocu = getOcu();
     CUdeviceptr mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data());
 
@@ -282,7 +301,6 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t
 
     size_t channel_size = xsize * ysize * sizeof(float);
 
-    CUresult err = CUDA_SUCCESS;
     ocu_args_d_t &ocl = getOcu();
     ocu_channels rgb_blurred = ocl.allocMemChannels(channel_size);
 
@@ -292,7 +310,7 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t
 
     void *args[] = { &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b };
 
-    CUresult r = cuLaunchKernel(ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE],
+    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE],
         xsize * ysize, 1, 1,
         1, 1, 1,
         0,
@@ -694,7 +712,8 @@ void cuCombineChannelsEx(
         &mask.r, &mask.g, &mask.b,
         &mask_dc.r, &mask_dc.g, &mask_dc.b,
         &xsize, &ysize,
-        &block_diff_dc, &block_diff_ac, &edge_detector_map,
+        &block_diff_dc, &block_diff_ac,
+		&edge_detector_map,
         &res_xsize,
         &step };
 

From 39bcbd149e88fcc12e21dd20542b012d6530c878 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sun, 4 Jun 2017 14:38:19 +0800
Subject: [PATCH 131/189] =?UTF-8?q?=E7=B2=BE=E7=AE=80=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl.h   |  30 ++---
 clguetzli/clguetzli.cpp    |  76 +++----------
 clguetzli/clguetzli_test.h |   2 -
 clguetzli/cuguetzli.cpp    | 220 ++++++++++++++++++++-----------------
 clguetzli/cuguetzli.h      |  54 ++++-----
 clguetzli/ocl.cpp          |  81 +++++++++-----
 clguetzli/ocl.h            |  64 +++--------
 clguetzli/ocu.cpp          |  10 +-
 clguetzli/ocu.h            |   6 +-
 9 files changed, 254 insertions(+), 289 deletions(-)

diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h
index 4e461399..6dec83c8 100644
--- a/clguetzli/clguetzli.cl.h
+++ b/clguetzli/clguetzli.cl.h
@@ -19,6 +19,7 @@
 
     typedef unsigned char uchar;
     typedef unsigned short ushort;
+    typedef CUdeviceptr cu_mem;
 
     int get_global_id(int dim);
     int get_global_size(int dim);
@@ -78,19 +79,19 @@
         {
             struct
             {
-                CUdeviceptr r;
-                CUdeviceptr g;
-                CUdeviceptr b;
+                cu_mem r;
+                cu_mem g;
+                cu_mem b;
             };
             struct
             {
-                CUdeviceptr x;
-                CUdeviceptr y;
-                CUdeviceptr b_;
+                cu_mem x;
+                cu_mem y;
+                cu_mem b_;
             };
             union
             {
-                CUdeviceptr ch[3];
+                cu_mem ch[3];
             };
         }ocu_channels;
     #endif
@@ -100,22 +101,7 @@
 #ifdef __OPENCL_VERSION__
     #define __constant_ex __constant
     #define __device__
-/*
-    typedef union ocl_channels_t
-    {
-        struct
-        {
-            float * r;
-            float * g;
-            float * b;
-        };
 
-        union
-        {
-            float *ch[3];
-        };
-    }ocl_channels;
-*/
 #endif /*__OPENCL_VERSION__*/
 
 #ifdef __CUDACC__
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 53be7348..15feb7d1 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -8,61 +8,6 @@ extern bool g_useOpenCL = false;
 extern bool g_useCuda = false;
 extern bool g_checkOpenCL = false;
 
-ocl_args_d_t& getOcl(void)
-{
-	static bool bInit = false;
-	static ocl_args_d_t ocl;
-
-	if (bInit == true) return ocl;
-
-	bInit = true;
-	cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU);
-    LOG_CL_RESULT(err);
-
-	char* source = nullptr;
-	size_t src_size = 0;
-	ReadSourceFromFile("clguetzli/clguetzli.cl", &source, &src_size);
-
-	ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err);
-
-	delete[] source;
-
-	err = clBuildProgram(ocl.program, 1, &ocl.device, "", NULL, NULL);
-    LOG_CL_RESULT(err);
-    if (CL_BUILD_PROGRAM_FAILURE == err)
-    {
-        size_t log_size = 0;
-        clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
-
-        std::vector<char> build_log(log_size);
-        clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL);
-
-        LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]);
-    }
-	
-    ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "clConvolutionEx", &err);
-    ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionXEx", &err);
-    ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionYEx", &err);
-    ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "clSquareSampleEx", &err);
-	ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "clOpsinDynamicsImageEx", &err);
-    ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "clMaskHighIntensityChangeEx", &err);
-    ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMapEx", &err);
-    ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMapEx", &err);
-    ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreqEx", &err);
-    ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "clDiffPrecomputeEx", &err);
-    ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "clScaleImageEx", &err);
-    ocl.kernel[KERNEL_AVERAGE5X5] = clCreateKernel(ocl.program, "clAverage5x5Ex", &err);
-    ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "clMinSquareValEx", &err);
-    ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "clDoMaskEx", &err);
-    ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "clCombineChannelsEx", &err);
-    ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "clUpsampleSquareRootEx", &err);
-    ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "clRemoveBorderEx", &err);
-    ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "clAddBorderEx", &err);
-    ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrderEx", &err);
-
-	return ocl;
-}
-
 void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize)
 {
     size_t channel_size = xsize * ysize * sizeof(float);
@@ -250,7 +195,8 @@ void clConvolutionEx(
 	size_t oxsize = (xsize + xstep - 1) / xstep;
 
 	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION];
-    clSetKernelArgEx(kernel, &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio);
+    clSetKernelArgEx(kernel, 
+					&result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio);
 
 	size_t globalWorkSize[2] = { oxsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -268,7 +214,8 @@ void clConvolutionXEx(
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONX];
-    clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio);
+    clSetKernelArgEx(kernel, 
+				&result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -286,7 +233,8 @@ void clConvolutionYEx(
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONY];
-    clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio);
+    clSetKernelArgEx(kernel, 
+			&result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -364,7 +312,8 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t
 	clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
 
 	cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE];
-    clSetKernelArgEx(kernel, &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b);
+    clSetKernelArgEx(kernel, 
+					&rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b);
 
 	size_t globalWorkSize[1] = { xsize * ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -396,10 +345,11 @@ void clMaskHighIntensityChangeEx(
 	clFinish(ocl.commandQueue);
 
 	cl_kernel kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE];
-    clSetKernelArgEx(kernel, &xyb0.r, &xyb0.g, &xyb0.b,
-                            &xyb1.r, &xyb1.g, &xyb1.b,
-                            &c0.r, &c0.g, &c0.b,
-                            &c1.r, &c1.g, &c1.b);
+    clSetKernelArgEx(kernel, 
+		&xyb0.r, &xyb0.g, &xyb0.b,
+    	&xyb1.r, &xyb1.g, &xyb1.b,
+        &c0.r, &c0.g, &c0.b,
+        &c1.r, &c1.g, &c1.b);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h
index b27c7942..94c0a2c6 100644
--- a/clguetzli/clguetzli_test.h
+++ b/clguetzli/clguetzli_test.h
@@ -1,8 +1,6 @@
 #pragma once
 #include "ocl.h"
 
-ocl_args_d_t& getOcl(void);
-
 void tclMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize,
diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
index ced35da7..9ffae8b4 100644
--- a/clguetzli/cuguetzli.cpp
+++ b/clguetzli/cuguetzli.cpp
@@ -4,6 +4,8 @@
 
 #ifdef __USE_CUDA__
 
+#define cuFinish cuStreamSynchronize
+
 void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize)
 {
     size_t channel_size = xsize * ysize * sizeof(float);
@@ -37,11 +39,11 @@ void cuDiffmapOpsinDynamicsImage(
     ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
     ocu_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
 
-    CUdeviceptr mem_result = ocl.allocMem(channel_size, result);
+    cu_mem mem_result = ocl.allocMem(channel_size, result);
 
-    CUdeviceptr edge_detector_map = ocl.allocMem(3 * channel_step_size);
-    CUdeviceptr block_diff_dc = ocl.allocMem(3 * channel_step_size);
-    CUdeviceptr block_diff_ac = ocl.allocMem(3 * channel_step_size);
+    cu_mem edge_detector_map = ocl.allocMem(3 * channel_step_size);
+    cu_mem block_diff_dc = ocl.allocMem(3 * channel_step_size);
+    cu_mem block_diff_ac = ocl.allocMem(3 * channel_step_size);
 
     cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
 
@@ -95,9 +97,9 @@ void cuComputeBlockZeroingOrder(
     cl_int err = 0;
     ocu_args_d_t &ocl = getOcu();
 
-    CUdeviceptr mem_orig_coeff[3];
-    CUdeviceptr mem_mayout_coeff[3];
-    CUdeviceptr mem_mayout_pixel[3];
+    cu_mem mem_orig_coeff[3];
+    cu_mem mem_mayout_coeff[3];
+    cu_mem mem_mayout_pixel[3];
     for (int c = 0; c < 3; c++)
     {
         int block_count = orig_channel[c].block_width * orig_channel[c].block_height;
@@ -108,11 +110,11 @@ void cuComputeBlockZeroingOrder(
 
         mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel);
     }
-    CUdeviceptr mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch);
-    CUdeviceptr mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale);
+    cu_mem mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch);
+    cu_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale);
 
     int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height;
-    CUdeviceptr mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch);
+    cu_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch);
 
     const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2],
         &mem_orig_image, &mem_mask_scale,
@@ -130,8 +132,10 @@ void cuComputeBlockZeroingOrder(
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
+    LOG_CU_RESULT(err);
 
-    err = cuStreamSynchronize(ocl.stream);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 
     cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size);
 
@@ -180,9 +184,9 @@ void cuMask(
 }
 
 void cuConvolutionEx(
-    CUdeviceptr result/*out*/,
-    const CUdeviceptr inp, size_t xsize, size_t ysize,
-    const CUdeviceptr multipliers, size_t len,
+    cu_mem result/*out*/,
+    const cu_mem inp, size_t xsize, size_t ysize,
+    const cu_mem multipliers, size_t len,
     int xstep, int offset, float border_ratio)
 {
     ocu_args_d_t &ocl = getOcu();
@@ -196,15 +200,16 @@ void cuConvolutionEx(
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 }
 
 
 void cuConvolutionXEx(
-    CUdeviceptr result/*out*/,
-    const CUdeviceptr inp, size_t xsize, size_t ysize,
-    const CUdeviceptr multipliers, size_t len,
+    cu_mem result/*out*/,
+    const cu_mem inp, size_t xsize, size_t ysize,
+    const cu_mem multipliers, size_t len,
     int xstep, int offset, float border_ratio)
 {
     ocu_args_d_t &ocl = getOcu();
@@ -216,15 +221,16 @@ void cuConvolutionXEx(
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 }
 
 void cuConvolutionYEx(
-    CUdeviceptr result/*out*/,
-    const CUdeviceptr inp, size_t xsize, size_t ysize,
-    const CUdeviceptr multipliers, size_t len,
-    int xstep, int offset, double border_ratio)
+    cu_mem result/*out*/,
+    const cu_mem inp, size_t xsize, size_t ysize,
+    const cu_mem multipliers, size_t len,
+    int xstep, int offset, float border_ratio)
 {
     ocu_args_d_t &ocl = getOcu();
 
@@ -235,13 +241,14 @@ void cuConvolutionYEx(
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 }
 
 void cuSquareSampleEx(
-    CUdeviceptr result/*out*/,
-    const CUdeviceptr image, size_t xsize, size_t ysize,
+    cu_mem result/*out*/,
+    const cu_mem image, size_t xsize, size_t ysize,
     size_t xstep, size_t ystep)
 {
     ocu_args_d_t &ocu = getOcu();
@@ -253,13 +260,14 @@ void cuSquareSampleEx(
         1, 1, 1,
         0,
         ocu.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocu.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocu.stream);
+	LOG_CU_RESULT(err);
 }
 
-void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize,
+void cuBlurEx(cu_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
     const double sigma, const double border_ratio,
-    CUdeviceptr result/*out, opt*/)
+    cu_mem result/*out, opt*/)
 {
     double m = 2.25;  // Accuracy increases when m is increased.
     const double scaler = -1.0 / (2 * sigma * sigma);
@@ -273,23 +281,23 @@ void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ys
 
     const int xstep = std::max<int>(1, int(sigma / 3));
 
-    ocu_args_d_t &ocu = getOcu();
-    CUdeviceptr mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data());
+    ocu_args_d_t &ocl = getOcu();
+    cu_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data());
 
     if (xstep > 1)
     {
-        CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize);
-        cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-        cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        cu_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize);
+        cuConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        cuConvolutionYEx(result ? result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
         cuSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep);
-        cuMemFree(srcA);
+        cuMemFree(m);
     }
     else
     {
-        CUdeviceptr srcA = ocu.allocMem(sizeof(cl_float) * xsize * ysize);
-        cuConvolutionXEx(srcA, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-        cuConvolutionYEx(result ? result : image, srcA, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-        cuMemFree(srcA);
+        cu_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize);
+        cuConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        cuConvolutionYEx(result ? result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
+        cuMemFree(m);
     }
 
     cuMemFree(mem_expn);
@@ -315,8 +323,9 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t
         1, 1, 1,
         0,
         ocl.stream, args, NULL);
-
-    r = cuStreamSynchronize(ocl.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 
     ocl.releaseMemChannels(rgb_blurred);
 }
@@ -333,14 +342,16 @@ void cuMaskHighIntensityChangeEx(
     ocu_channels c0 = ocl.allocMemChannels(channel_size);
     ocu_channels c1 = ocl.allocMemChannels(channel_size);
 
-    cuMemcpyDtoD(c0.r, xyb0.r, channel_size);
-    cuMemcpyDtoD(c0.g, xyb0.g, channel_size);
-    cuMemcpyDtoD(c0.b, xyb0.b, channel_size);
-    cuMemcpyDtoD(c1.r, xyb1.r, channel_size);
-    cuMemcpyDtoD(c1.g, xyb1.g, channel_size);
-    cuMemcpyDtoD(c1.b, xyb1.b, channel_size);
+    cuMemcpyDtoDAsync(c0.r, xyb0.r, channel_size, ocl.stream);
+    cuMemcpyDtoDAsync(c0.g, xyb0.g, channel_size, ocl.stream);
+    cuMemcpyDtoDAsync(c0.b, xyb0.b, channel_size, ocl.stream);
+    cuMemcpyDtoDAsync(c1.r, xyb1.r, channel_size, ocl.stream);
+    cuMemcpyDtoDAsync(c1.g, xyb1.g, channel_size, ocl.stream);
+    cuMemcpyDtoDAsync(c1.b, xyb1.b, channel_size, ocl.stream);
+	cuFinish(ocl.stream);
 
-    const void *args[] = { &xyb0.r, &xyb0.g, &xyb0.b,
+    const void *args[] = { 
+		&xyb0.r, &xyb0.g, &xyb0.b,
         &xyb1.r, &xyb1.g, &xyb1.b,
         &c0.r, &c0.g, &c0.b,
         &c1.r, &c1.g, &c1.b };
@@ -350,15 +361,16 @@ void cuMaskHighIntensityChangeEx(
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 
     ocl.releaseMemChannels(c0);
     ocl.releaseMemChannels(c1);
 }
 
 void cuEdgeDetectorMapEx(
-    CUdeviceptr result/*out*/,
+    cu_mem result/*out*/,
     const ocu_channels &rgb, const ocu_channels &rgb2,
     const size_t xsize, const size_t ysize, const size_t step)
 {
@@ -390,16 +402,17 @@ void cuEdgeDetectorMapEx(
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 
     ocl.releaseMemChannels(rgb_blured);
     ocl.releaseMemChannels(rgb2_blured);
 }
 
 void cuBlockDiffMapEx(
-    CUdeviceptr block_diff_dc/*out*/,
-    CUdeviceptr block_diff_ac/*out*/,
+    cu_mem block_diff_dc/*out*/,
+    cu_mem block_diff_ac/*out*/,
     const ocu_channels &rgb, const ocu_channels &rgb2,
     const size_t xsize, const size_t ysize, const size_t step)
 {
@@ -418,12 +431,13 @@ void cuBlockDiffMapEx(
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 }
 
 void cuEdgeDetectorLowFreqEx(
-    CUdeviceptr block_diff_ac/*in,out*/,
+    cu_mem block_diff_ac/*in,out*/,
     const ocu_channels &rgb, const ocu_channels &rgb2,
     const size_t xsize, const size_t ysize, const size_t step)
 {
@@ -454,8 +468,9 @@ void cuEdgeDetectorLowFreqEx(
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 
     ocl.releaseMemChannels(rgb_blured);
     ocl.releaseMemChannels(rgb2_blured);
@@ -477,11 +492,12 @@ void cuDiffPrecomputeEx(
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 }
 
-void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w)
+void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w)
 {
     ocu_args_d_t &ocl = getOcu();
 
@@ -492,11 +508,12 @@ void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w)
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 }
 
-void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t ysize)
+void cuAverage5x5Ex(cu_mem img/*in,out*/, const size_t xsize, const size_t ysize)
 {
     if (xsize < 4 || ysize < 4) {
         // TODO: Make this work for small dimensions as well.
@@ -506,7 +523,7 @@ void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t
     ocu_args_d_t &ocl = getOcu();
 
     size_t len = xsize * ysize * sizeof(float);
-    CUdeviceptr img_org = ocl.allocMem(len);
+    cu_mem img_org = ocl.allocMem(len);
 
     cuMemcpyDtoD(img_org, img, len);
 
@@ -517,20 +534,21 @@ void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 
     cuMemFree(img_org);
 }
 
 void cuMinSquareValEx(
-    CUdeviceptr img/*in,out*/,
+    cu_mem img/*in,out*/,
     const size_t xsize, const size_t ysize,
     const size_t square_size, const size_t offset)
 {
     ocu_args_d_t &ocl = getOcu();
 
-    CUdeviceptr srcA = ocl.allocMem(sizeof(float) * xsize * ysize);
+    cu_mem srcA = ocl.allocMem(sizeof(float) * xsize * ysize);
 
     const void *args[] = { &srcA, &img, &square_size, &offset };
 
@@ -539,9 +557,9 @@ void cuMinSquareValEx(
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
     cuMemcpyDtoD(img, srcA, sizeof(float) * xsize * ysize);
     cuMemFree(srcA);
 }
@@ -656,8 +674,9 @@ void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, siz
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 
     ocl.releaseMemChannels(xyb);
     ocl.releaseMemChannels(xyb_dc);
@@ -693,13 +712,13 @@ void cuMaskEx(
 }
 
 void cuCombineChannelsEx(
-    CUdeviceptr result/*out*/,
+    cu_mem result/*out*/,
     const ocu_channels &mask,
     const ocu_channels &mask_dc,
     const size_t xsize, const size_t ysize,
-    const CUdeviceptr block_diff_dc,
-    const CUdeviceptr block_diff_ac,
-    const CUdeviceptr edge_detector_map,
+    const cu_mem block_diff_dc,
+    const cu_mem block_diff_ac,
+    const cu_mem edge_detector_map,
     const size_t res_xsize,
     const size_t step)
 {
@@ -722,15 +741,16 @@ void cuCombineChannelsEx(
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 }
 
-void cuUpsampleSquareRootEx(CUdeviceptr diffmap, const size_t xsize, const size_t ysize, const int step)
+void cuUpsampleSquareRootEx(cu_mem diffmap, const size_t xsize, const size_t ysize, const int step)
 {
     ocu_args_d_t &ocl = getOcu();
 
-    CUdeviceptr diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float));
+    cu_mem diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float));
 
     const void *args[] = { &diffmap_out,
         &diffmap,
@@ -745,15 +765,15 @@ void cuUpsampleSquareRootEx(CUdeviceptr diffmap, const size_t xsize, const size_
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
-
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
     cuMemcpyDtoD(diffmap, diffmap_out, xsize * ysize * sizeof(float));
 
     cuMemFree(diffmap_out);
 }
 
-void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr in, const size_t xsize, const size_t ysize, const int step)
+void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const size_t ysize, const int step)
 {
     ocu_args_d_t &ocl = getOcu();
 
@@ -771,11 +791,12 @@ void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr in, const size_t xsize,
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 }
 
-void cuAddBorderEx(CUdeviceptr out, size_t xsize, size_t ysize, int step, CUdeviceptr in)
+void cuAddBorderEx(cu_mem out, size_t xsize, size_t ysize, int step, cu_mem in)
 {
     ocu_args_d_t &ocl = getOcu();
 
@@ -792,11 +813,12 @@ void cuAddBorderEx(CUdeviceptr out, size_t xsize, size_t ysize, int step, CUdevi
         1, 1, 1,
         0,
         ocl.stream, (void**)args, NULL);
-
-    err = cuStreamSynchronize(ocl.stream);
+	LOG_CU_RESULT(err);
+    err = cuFinish(ocl.stream);
+	LOG_CU_RESULT(err);
 }
 
-void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step)
+void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step)
 {
     cuUpsampleSquareRootEx(diffmap, xsize, ysize, step);
 
@@ -808,7 +830,7 @@ void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, con
     int s2 = (8 - step) / 2;
 
     ocu_args_d_t &ocl = getOcu();
-    CUdeviceptr blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float));
+    cu_mem blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float));
     cuRemoveBorderEx(blurred, diffmap, xsize, ysize, step);
 
     static const double border_ratio = 0.03027655136;
diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h
index e9dddde6..81ec377b 100644
--- a/clguetzli/cuguetzli.h
+++ b/clguetzli/cuguetzli.h
@@ -35,25 +35,25 @@ void cuMask(
     const float* r2, const float* g2, const float* b2);
 
 void cuConvolutionXEx(
-    CUdeviceptr result/*out*/,
-    const CUdeviceptr inp, size_t xsize, size_t ysize,
-    const CUdeviceptr multipliers, size_t len,
-    int xstep, int offset, double border_ratio);
+    cu_mem result/*out*/,
+    const cu_mem inp, size_t xsize, size_t ysize,
+    const cu_mem multipliers, size_t len,
+    int xstep, int offset, float border_ratio);
 
 void cuConvolutionYEx(
-    CUdeviceptr result/*out*/,
-    const CUdeviceptr inp, size_t xsize, size_t ysize,
-    const CUdeviceptr multipliers, size_t len,
-    int xstep, int offset, double border_ratio);
+    cu_mem result/*out*/,
+    const cu_mem inp, size_t xsize, size_t ysize,
+    const cu_mem multipliers, size_t len,
+    int xstep, int offset, float border_ratio);
 
 void cuSquareSampleEx(
-    CUdeviceptr result/*out*/,
-    const CUdeviceptr image, size_t xsize, size_t ysize,
+    cu_mem result/*out*/,
+    const cu_mem image, size_t xsize, size_t ysize,
     size_t xstep, size_t ystep);
 
-void cuBlurEx(CUdeviceptr image/*out, opt*/, const size_t xsize, const size_t ysize,
+void cuBlurEx(cu_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
     const double sigma, const double border_ratio,
-    CUdeviceptr result = NULL/*out, opt*/);
+    cu_mem result = NULL/*out, opt*/);
 
 void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize);
 
@@ -63,18 +63,18 @@ void cuMaskHighIntensityChangeEx(
     const size_t xsize, const size_t ysize);
 
 void cuEdgeDetectorMapEx(
-    CUdeviceptr result/*out*/,
+    cu_mem result/*out*/,
     const ocu_channels &rgb, const ocu_channels &rgb2,
     const size_t xsize, const size_t ysize, const size_t step);
 
 void cuBlockDiffMapEx(
-    CUdeviceptr block_diff_dc/*out*/,
-    CUdeviceptr block_diff_ac/*out*/,
+    cu_mem block_diff_dc/*out*/,
+    cu_mem block_diff_ac/*out*/,
     const ocu_channels &rgb, const ocu_channels &rgb2,
     const size_t xsize, const size_t ysize, const size_t step);
 
 void cuEdgeDetectorLowFreqEx(
-    CUdeviceptr block_diff_ac/*in,out*/,
+    cu_mem block_diff_ac/*in,out*/,
     const ocu_channels &rgb, const ocu_channels &rgb2,
     const size_t xsize, const size_t ysize, const size_t step);
 
@@ -83,12 +83,12 @@ void cuDiffPrecomputeEx(
     const ocu_channels &xyb0, const ocu_channels &xyb1,
     const size_t xsize, const size_t ysize);
 
-void cuScaleImageEx(CUdeviceptr img/*in, out*/, size_t size, double w);
+void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w);
 
-void cuAverage5x5Ex(CUdeviceptr img/*in,out*/, const size_t xsize, const size_t ysize);
+void cuAverage5x5Ex(cu_mem img/*in,out*/, const size_t xsize, const size_t ysize);
 
 void cuMinSquareValEx(
-    CUdeviceptr img/*in,out*/,
+    cu_mem img/*in,out*/,
     const size_t xsize, const size_t ysize,
     const size_t square_size, const size_t offset);
 
@@ -98,22 +98,22 @@ void cuMaskEx(
     const size_t xsize, const size_t ysize);
 
 void cuCombineChannelsEx(
-    CUdeviceptr result/*out*/,
+    cu_mem result/*out*/,
     const ocu_channels &mask,
     const ocu_channels &mask_dc,
     const size_t xsize, const size_t ysize,
-    const CUdeviceptr block_diff_dc,
-    const CUdeviceptr block_diff_ac,
-    const CUdeviceptr edge_detector_map,
+    const cu_mem block_diff_dc,
+    const cu_mem block_diff_ac,
+    const cu_mem edge_detector_map,
     const size_t res_xsize,
     const size_t step);
 
-void cuUpsampleSquareRootEx(CUdeviceptr diffmap, const size_t xsize, const size_t ysize, const int step);
+void cuUpsampleSquareRootEx(cu_mem diffmap, const size_t xsize, const size_t ysize, const int step);
 
-void cuRemoveBorderEx(CUdeviceptr out, const CUdeviceptr in, const size_t xsize, const size_t ysize, const int step);
+void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const size_t ysize, const int step);
 
-void cuAddBorderEx(CUdeviceptr out, const size_t xsize, const size_t ysize, const int step, const CUdeviceptr in);
+void cuAddBorderEx(cu_mem out, const size_t xsize, const size_t ysize, const int step, const cu_mem in);
 
-void cuCalculateDiffmapEx(CUdeviceptr diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step);
+void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step);
 
 #endif
\ No newline at end of file
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index 5218ce9b..639ad68e 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -1,12 +1,61 @@
 #include "ocl.h"
 #include <string.h>
-#ifdef __linux__
-#include <malloc.h>
-#define _aligned_malloc memalign
-#define _aligned_free free
-#endif
 #include <vector>
 
+ocl_args_d_t& getOcl(void)
+{
+    static bool bInit = false;
+    static ocl_args_d_t ocl;
+
+    if (bInit == true) return ocl;
+
+    bInit = true;
+    cl_int err = SetupOpenCL(&ocl, CL_DEVICE_TYPE_GPU);
+    LOG_CL_RESULT(err);
+
+    char* source = nullptr;
+    size_t src_size = 0;
+    ReadSourceFromFile("clguetzli/clguetzli.cl", &source, &src_size);
+
+    ocl.program = clCreateProgramWithSource(ocl.context, 1, (const char**)&source, &src_size, &err);
+
+    delete[] source;
+
+    err = clBuildProgram(ocl.program, 1, &ocl.device, "", NULL, NULL);
+    LOG_CL_RESULT(err);
+    if (CL_BUILD_PROGRAM_FAILURE == err)
+    {
+        size_t log_size = 0;
+        clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+
+        std::vector<char> build_log(log_size);
+        clGetProgramBuildInfo(ocl.program, ocl.device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL);
+
+        LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]);
+    }
+
+    ocl.kernel[KERNEL_CONVOLUTION] = clCreateKernel(ocl.program, "clConvolutionEx", &err);
+    ocl.kernel[KERNEL_CONVOLUTIONX] = clCreateKernel(ocl.program, "clConvolutionXEx", &err);
+    ocl.kernel[KERNEL_CONVOLUTIONY] = clCreateKernel(ocl.program, "clConvolutionYEx", &err);
+    ocl.kernel[KERNEL_SQUARESAMPLE] = clCreateKernel(ocl.program, "clSquareSampleEx", &err);
+    ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE] = clCreateKernel(ocl.program, "clOpsinDynamicsImageEx", &err);
+    ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE] = clCreateKernel(ocl.program, "clMaskHighIntensityChangeEx", &err);
+    ocl.kernel[KERNEL_EDGEDETECTOR] = clCreateKernel(ocl.program, "clEdgeDetectorMapEx", &err);
+    ocl.kernel[KERNEL_BLOCKDIFFMAP] = clCreateKernel(ocl.program, "clBlockDiffMapEx", &err);
+    ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ] = clCreateKernel(ocl.program, "clEdgeDetectorLowFreqEx", &err);
+    ocl.kernel[KERNEL_DIFFPRECOMPUTE] = clCreateKernel(ocl.program, "clDiffPrecomputeEx", &err);
+    ocl.kernel[KERNEL_SCALEIMAGE] = clCreateKernel(ocl.program, "clScaleImageEx", &err);
+    ocl.kernel[KERNEL_AVERAGE5X5] = clCreateKernel(ocl.program, "clAverage5x5Ex", &err);
+    ocl.kernel[KERNEL_MINSQUAREVAL] = clCreateKernel(ocl.program, "clMinSquareValEx", &err);
+    ocl.kernel[KERNEL_DOMASK] = clCreateKernel(ocl.program, "clDoMaskEx", &err);
+    ocl.kernel[KERNEL_COMBINECHANNELS] = clCreateKernel(ocl.program, "clCombineChannelsEx", &err);
+    ocl.kernel[KERNEL_UPSAMPLESQUAREROOT] = clCreateKernel(ocl.program, "clUpsampleSquareRootEx", &err);
+    ocl.kernel[KERNEL_REMOVEBORDER] = clCreateKernel(ocl.program, "clRemoveBorderEx", &err);
+    ocl.kernel[KERNEL_ADDBORDER] = clCreateKernel(ocl.program, "clAddBorderEx", &err);
+    ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER] = clCreateKernel(ocl.program, "clComputeBlockZeroingOrderEx", &err);
+
+    return ocl;
+}
 
 ocl_args_d_t::ocl_args_d_t() :
 	context(NULL),
@@ -23,17 +72,6 @@ ocl_args_d_t::ocl_args_d_t() :
 	}
 }
 
-/*
-* destructor - called only once
-* Release all OpenCL objects
-* This is a regular sequence of calls to deallocate all created OpenCL resources in bootstrapOpenCL.
-*
-* You may want to call these deallocation procedures in the middle of your application execution
-* (not at the end) if you don't further need OpenCL runtime.
-* You may want to do that in order to free some memory, for example,
-* or recreate OpenCL objects with different parameters.
-*
-*/
 ocl_args_d_t::~ocl_args_d_t()
 {
 	cl_int err = CL_SUCCESS;
@@ -45,16 +83,7 @@ ocl_args_d_t::~ocl_args_d_t()
 			LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(err));
 		}
 	}
-/*
-	if (kernel)
-	{
-		err = clReleaseKernel(kernel);
-		if (CL_SUCCESS != err)
-		{
-			LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(err));
-		}
-	}
-*/
+
 	if (program)
 	{
 		err = clReleaseProgram(program);
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index 13eb232b..a9573fa6 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -8,48 +8,12 @@
 #define OPENCL_VERSION_1_2  1.2f
 #define OPENCL_VERSION_2_0  2.0f
 
-struct ocl_args_d_t;
-
-/* This function helps to create informative messages in
-* case when OpenCL errors occur. It returns a string
-* representation for an OpenCL error code.
-* (E.g. "CL_DEVICE_NOT_FOUND" instead of just -1.)
-*/
-const char* TranslateOpenCLError(cl_int errorCode);
-
-/*
-* This function picks/creates necessary OpenCL objects which are needed.
-* The objects are:
-* OpenCL platform, device, context, and command queue.
-*
-* All these steps are needed to be performed once in a regular OpenCL application.
-* This happens before actual compute kernels calls are performed.
-*
-* For convenience, in this application you store all those basic OpenCL objects in structure ocl_args_d_t,
-* so this function populates fields of this structure, which is passed as parameter ocl.
-* Please, consider reviewing the fields before going further.
-* The structure definition is right in the beginning of this file.
-*/
-int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType);
-
-
-/* Convenient container for all OpenCL specific objects used in the sample
-*
-* It consists of two parts:
-*   - regular OpenCL objects which are used in almost each normal OpenCL applications
-*   - several OpenCL objects that are specific for this particular sample
-*
-* You collect all these objects in one structure for utility purposes
-* only, there is no OpenCL specific here: just to avoid global variables
-* and make passing all these arguments in functions easier.
-*/
-
 enum KernelName {
-	KERNEL_CONVOLUTION = 0,
-	KERNEL_CONVOLUTIONX,
-	KERNEL_CONVOLUTIONY,
-	KERNEL_SQUARESAMPLE,
-	KERNEL_OPSINDYNAMICSIMAGE,
+    KERNEL_CONVOLUTION = 0,
+    KERNEL_CONVOLUTIONX,
+    KERNEL_CONVOLUTIONY,
+    KERNEL_SQUARESAMPLE,
+    KERNEL_OPSINDYNAMICSIMAGE,
     KERNEL_MASKHIGHINTENSITYCHANGE,
     KERNEL_EDGEDETECTOR,
     KERNEL_BLOCKDIFFMAP,
@@ -59,16 +23,24 @@ enum KernelName {
     KERNEL_AVERAGE5X5,
     KERNEL_MINSQUAREVAL,
     KERNEL_DOMASK,
-	KERNEL_COMBINECHANNELS,
-	KERNEL_UPSAMPLESQUAREROOT,
+    KERNEL_COMBINECHANNELS,
+    KERNEL_UPSAMPLESQUAREROOT,
     KERNEL_REMOVEBORDER,
-	KERNEL_ADDBORDER,
-	KERNEL_COMPUTEBLOCKZEROINGORDER,
-	KERNEL_COUNT,
+    KERNEL_ADDBORDER,
+    KERNEL_COMPUTEBLOCKZEROINGORDER,
+    KERNEL_COUNT,
 };
 
 #define LOG_CL_RESULT(e)   if (CL_SUCCESS != (e)) { LogError("Error: %s:%d returned %s.\n", __FUNCTION__, __LINE__, TranslateOpenCLError((e)));}
 
+struct ocl_args_d_t;
+
+const char* TranslateOpenCLError(cl_int errorCode);
+
+int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType);
+
+ocl_args_d_t& getOcl(void);
+
 struct ocl_args_d_t
 {
 	ocl_args_d_t();
diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index 6fbf58ee..d15733c9 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -1,6 +1,6 @@
+#include "ocu.h"
 #include <cuda.h>
 #include <nvrtc.h>
-#include "ocu.h"
 
 #ifdef __USE_CUDA__
 
@@ -113,9 +113,9 @@ ocu_args_d_t::~ocu_args_d_t()
 //    cuStreamDestroy(stream);
 }
 
-CUdeviceptr ocu_args_d_t::allocMem(size_t s, const void *init)
+cu_mem ocu_args_d_t::allocMem(size_t s, const void *init)
 {
-    CUdeviceptr mem;
+    cu_mem mem;
     cuMemAlloc(&mem, s);
     if (init)
     {
@@ -151,4 +151,8 @@ void ocu_args_d_t::releaseMemChannels(ocu_channels &rgb)
     }
 }
 
+const char* TranslateCUDAError(CUresult errorCode)
+{
+    return "Unknwon";
+}
 #endif
\ No newline at end of file
diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h
index 4c34edaf..426019cc 100644
--- a/clguetzli/ocu.h
+++ b/clguetzli/ocu.h
@@ -5,8 +5,12 @@
 #include <cuda.h>
 #include "ocl.h"
 
+#define LOG_CU_RESULT(e)   if (CUDA_SUCCESS != (e)) { LogError("Error: %s:%d returned %s.\n", __FUNCTION__, __LINE__, TranslateCUDAError((e)));}
+
 struct ocu_args_d_t;
 
+const char* TranslateCUDAError(CUresult errorCode);
+
 ocu_args_d_t& getOcu(void);
 
 struct ocu_args_d_t
@@ -14,7 +18,7 @@ struct ocu_args_d_t
     ocu_args_d_t();
     ~ocu_args_d_t();
 
-    CUdeviceptr allocMem(size_t s, const void *init = NULL);
+    cu_mem allocMem(size_t s, const void *init = NULL);
     ocu_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL);
     void releaseMemChannels(ocu_channels &rgb);
 

From 1cb6e526d4629896bd4e513d0307a42aa20b5de1 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sun, 4 Jun 2017 22:30:07 +0800
Subject: [PATCH 132/189] =?UTF-8?q?cu=E7=BC=96=E8=AF=91=E6=94=B9=E5=9B=9En?=
 =?UTF-8?q?vcc=E6=8F=90=E5=89=8D=E7=BC=96=E8=AF=91=20=E7=BB=A7=E7=BB=AD?=
 =?UTF-8?q?=E7=AE=80=E5=8C=96=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cpp |  22 ++---
 clguetzli/cuguetzli.cpp | 189 +++++++++++++++++++++-------------------
 clguetzli/ocu.cpp       | 138 ++++++++++++++++++++++-------
 clguetzli/ocu.h         |   2 +-
 compile.bat             |   5 +-
 guetzli.vcxproj         |  20 +++--
 6 files changed, 231 insertions(+), 145 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 15feb7d1..4495e935 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -97,7 +97,6 @@ void clComputeBlockZeroingOrder(
 
     using namespace guetzli;
 
-    cl_int err = 0;
     ocl_args_d_t &ocl = getOcl();
 
     cl_mem mem_orig_coeff[3];
@@ -132,7 +131,7 @@ void clComputeBlockZeroingOrder(
 						&mem_output_order_batch);
 
     size_t globalWorkSize[2] = { blockf_width, blockf_height };
-    err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+    cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
     LOG_CL_RESULT(err);
     err = clFinish(ocl.commandQueue);
     LOG_CL_RESULT(err);
@@ -195,8 +194,7 @@ void clConvolutionEx(
 	size_t oxsize = (xsize + xstep - 1) / xstep;
 
 	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTION];
-    clSetKernelArgEx(kernel, 
-					&result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio);
+    clSetKernelArgEx(kernel, &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio);
 
 	size_t globalWorkSize[2] = { oxsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -214,8 +212,7 @@ void clConvolutionXEx(
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONX];
-    clSetKernelArgEx(kernel, 
-				&result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio);
+    clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -233,8 +230,7 @@ void clConvolutionYEx(
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONY];
-    clSetKernelArgEx(kernel, 
-			&result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio);
+    clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -312,8 +308,7 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t
 	clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
 
 	cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE];
-    clSetKernelArgEx(kernel, 
-					&rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b);
+    clSetKernelArgEx(kernel,  &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b);
 
 	size_t globalWorkSize[1] = { xsize * ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -367,6 +362,7 @@ void clEdgeDetectorMapEx(
     const size_t xsize, const size_t ysize, const size_t step)
 {
 	size_t channel_size = xsize * ysize * sizeof(float);
+ 
 	ocl_args_d_t &ocl = getOcl();
 
 	ocl_channels rgb_blured = ocl.allocMemChannels(channel_size);
@@ -470,7 +466,7 @@ void clDiffPrecomputeEx(
 	cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE];
     clSetKernelArgEx(kernel, &mask.x, &mask.y, &mask.b, 
                             &xyb0.x, &xyb0.y, &xyb0.b,
-        &xyb1.x, &xyb1.y, &xyb1.b);
+							&xyb1.x, &xyb1.y, &xyb1.b);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -484,7 +480,7 @@ void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w)
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE];
-    clSetKernelArgEx(kernel, &img, &w);
+	clSetKernelArgEx(kernel, &img, &w);
 
 	size_t globalWorkSize[1] = { size };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -704,7 +700,7 @@ void clCombineChannelsEx(
 
 	cl_kernel kernel = ocl.kernel[KERNEL_COMBINECHANNELS];
     clSetKernelArgEx(kernel, &result, 
-                            &mask.r, &mask.g, &mask.b,
+    	&mask.r, &mask.g, &mask.b,
         &mask_dc.r, &mask_dc.g, &mask_dc.b, 
         &xsize, &ysize,
         &block_diff_dc, &block_diff_ac,
diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
index 9ffae8b4..0a464a77 100644
--- a/clguetzli/cuguetzli.cpp
+++ b/clguetzli/cuguetzli.cpp
@@ -74,7 +74,6 @@ void cuDiffmapOpsinDynamicsImage(
     cuMemFree(mem_result);
 }
 
-
 void cuComputeBlockZeroingOrder(
     guetzli::CoeffData *output_order_batch,
     const channel_info orig_channel[3],
@@ -94,7 +93,6 @@ void cuComputeBlockZeroingOrder(
 
     using namespace guetzli;
 
-    cl_int err = 0;
     ocu_args_d_t &ocl = getOcu();
 
     cu_mem mem_orig_coeff[3];
@@ -116,6 +114,7 @@ void cuComputeBlockZeroingOrder(
     int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height;
     cu_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch);
 
+    CUfunction kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER];
     const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2],
         &mem_orig_image, &mem_mask_scale,
         &image_width, &image_height,
@@ -127,14 +126,14 @@ void cuComputeBlockZeroingOrder(
         &BlockErrorLimit,
         &mem_output_order_batch };
 
-    err = cuLaunchKernel(ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER],
+    CUresult err = cuLaunchKernel(kernel,
         blockf_width, blockf_height, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
     LOG_CU_RESULT(err);
 
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 
     cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size);
@@ -170,12 +169,13 @@ void cuMask(
 
     cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize);
 
-    cuMemcpyDtoH(mask_r, mask.r, channel_size);
-    cuMemcpyDtoH(mask_g, mask.g, channel_size);
-    cuMemcpyDtoH(mask_b, mask.b, channel_size);
-    cuMemcpyDtoH(maskdc_r, mask_dc.r, channel_size);
-    cuMemcpyDtoH(maskdc_g, mask_dc.g, channel_size);
-    cuMemcpyDtoH(maskdc_b, mask_dc.b, channel_size);
+    cuMemcpyDtoHAsync(mask_r, mask.r, channel_size, ocl.commandQueue);
+    cuMemcpyDtoHAsync(mask_g, mask.g, channel_size, ocl.commandQueue);
+    cuMemcpyDtoHAsync(mask_b, mask.b, channel_size, ocl.commandQueue);
+    cuMemcpyDtoHAsync(maskdc_r, mask_dc.r, channel_size, ocl.commandQueue);
+    cuMemcpyDtoHAsync(maskdc_g, mask_dc.g, channel_size, ocl.commandQueue);
+    cuMemcpyDtoHAsync(maskdc_b, mask_dc.b, channel_size, ocl.commandQueue);
+    cuFinish(ocl.commandQueue);
 
     ocl.releaseMemChannels(rgb);
     ocl.releaseMemChannels(rgb2);
@@ -193,15 +193,16 @@ void cuConvolutionEx(
 
     size_t oxsize = (xsize + xstep - 1) / xstep;
 
+	CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTION];
     const void *args[] = { &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio };
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTIONX],
+    CUresult err = cuLaunchKernel(kernel,
         oxsize, ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -214,15 +215,16 @@ void cuConvolutionXEx(
 {
     ocu_args_d_t &ocl = getOcu();
 
+	CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTIONX];
     const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTIONX],
+    CUresult err = cuLaunchKernel(kernel,
         xsize, ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -234,15 +236,16 @@ void cuConvolutionYEx(
 {
     ocu_args_d_t &ocl = getOcu();
 
+	CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTIONY];
     const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_CONVOLUTIONY],
+    CUresult err = cuLaunchKernel(kernel,
         xsize, ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -251,17 +254,18 @@ void cuSquareSampleEx(
     const cu_mem image, size_t xsize, size_t ysize,
     size_t xstep, size_t ystep)
 {
-    ocu_args_d_t &ocu = getOcu();
+    ocu_args_d_t &ocl = getOcu();
 
+	CUfunction kernel = ocl.kernel[KERNEL_SQUARESAMPLE];
     const void *args[] = { &result, &image, &xstep, &ystep };
 
-    CUresult err = cuLaunchKernel(ocu.kernel[KERNEL_SQUARESAMPLE],
+    CUresult err = cuLaunchKernel(kernel,
         xsize, ysize, 1,
         1, 1, 1,
         0,
-        ocu.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocu.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -316,15 +320,16 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t
     cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g);
     cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
 
+	CUfunction kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE];
     void *args[] = { &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b };
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE],
+    CUresult err = cuLaunchKernel(kernel,
         xsize * ysize, 1, 1,
         1, 1, 1,
         0,
-        ocl.stream, args, NULL);
+        ocl.commandQueue, args, NULL);
 	LOG_CU_RESULT(err);
-    r = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 
     ocl.releaseMemChannels(rgb_blurred);
@@ -342,27 +347,28 @@ void cuMaskHighIntensityChangeEx(
     ocu_channels c0 = ocl.allocMemChannels(channel_size);
     ocu_channels c1 = ocl.allocMemChannels(channel_size);
 
-    cuMemcpyDtoDAsync(c0.r, xyb0.r, channel_size, ocl.stream);
-    cuMemcpyDtoDAsync(c0.g, xyb0.g, channel_size, ocl.stream);
-    cuMemcpyDtoDAsync(c0.b, xyb0.b, channel_size, ocl.stream);
-    cuMemcpyDtoDAsync(c1.r, xyb1.r, channel_size, ocl.stream);
-    cuMemcpyDtoDAsync(c1.g, xyb1.g, channel_size, ocl.stream);
-    cuMemcpyDtoDAsync(c1.b, xyb1.b, channel_size, ocl.stream);
-	cuFinish(ocl.stream);
+    cuMemcpyDtoDAsync(c0.r, xyb0.r, channel_size, ocl.commandQueue);
+    cuMemcpyDtoDAsync(c0.g, xyb0.g, channel_size, ocl.commandQueue);
+    cuMemcpyDtoDAsync(c0.b, xyb0.b, channel_size, ocl.commandQueue);
+    cuMemcpyDtoDAsync(c1.r, xyb1.r, channel_size, ocl.commandQueue);
+    cuMemcpyDtoDAsync(c1.g, xyb1.g, channel_size, ocl.commandQueue);
+    cuMemcpyDtoDAsync(c1.b, xyb1.b, channel_size, ocl.commandQueue);
+	cuFinish(ocl.commandQueue);
 
+	CUfunction kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE];
     const void *args[] = { 
 		&xyb0.r, &xyb0.g, &xyb0.b,
         &xyb1.r, &xyb1.g, &xyb1.b,
         &c0.r, &c0.g, &c0.b,
         &c1.r, &c1.g, &c1.b };
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE],
+    CUresult err = cuLaunchKernel(kernel,
         xsize, ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 
     ocl.releaseMemChannels(c0);
@@ -389,6 +395,7 @@ void cuEdgeDetectorMapEx(
         cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]);
     }
 
+	CUfunction kernel = ocl.kernel[KERNEL_EDGEDETECTOR];
     const void *args[] = { &result,
         &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
         &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
@@ -397,13 +404,13 @@ void cuEdgeDetectorMapEx(
     const size_t res_xsize = (xsize + step - 1) / step;
     const size_t res_ysize = (ysize + step - 1) / step;
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTOR],
+    CUresult err = cuLaunchKernel(kernel,
         res_xsize, res_ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 
     ocl.releaseMemChannels(rgb_blured);
@@ -418,6 +425,7 @@ void cuBlockDiffMapEx(
 {
     ocu_args_d_t &ocl = getOcu();
 
+	CUfunction kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP];
     const void *args[] = { &block_diff_dc, &block_diff_ac,
         &rgb.r, &rgb.g, &rgb.b,
         &rgb2.r, &rgb2.g, &rgb2.b,
@@ -426,13 +434,13 @@ void cuBlockDiffMapEx(
     const size_t res_xsize = (xsize + step - 1) / step;
     const size_t res_ysize = (ysize + step - 1) / step;
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_BLOCKDIFFMAP],
+    CUresult err = cuLaunchKernel(kernel,
         res_xsize, res_ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -455,6 +463,7 @@ void cuEdgeDetectorLowFreqEx(
         cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]);
     }
 
+	CUfunction kernel = ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ];
     const void *args[] = { &block_diff_ac,
         &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
         &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
@@ -463,13 +472,13 @@ void cuEdgeDetectorLowFreqEx(
     const size_t res_xsize = (xsize + step - 1) / step;
     const size_t res_ysize = (ysize + step - 1) / step;
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ],
+    CUresult err = cuLaunchKernel(kernel,
         res_xsize, res_ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 
     ocl.releaseMemChannels(rgb_blured);
@@ -483,17 +492,18 @@ void cuDiffPrecomputeEx(
 {
     ocu_args_d_t &ocl = getOcu();
 
+	CUfunction kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE];
     const void *args[] = { &mask.x, &mask.y, &mask.b,
         &xyb0.x, &xyb0.y, &xyb0.b,
         &xyb1.x, &xyb1.y, &xyb1.b };
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DIFFPRECOMPUTE],
+    CUresult err = cuLaunchKernel(kernel,
         xsize, ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -501,15 +511,16 @@ void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w)
 {
     ocu_args_d_t &ocl = getOcu();
 
+	CUfunction kernel = ocl.kernel[KERNEL_SCALEIMAGE];
     const void *args[] = { &img, &w };
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_SCALEIMAGE],
+    CUresult err = cuLaunchKernel(kernel,
         size, 1, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -527,15 +538,16 @@ void cuAverage5x5Ex(cu_mem img/*in,out*/, const size_t xsize, const size_t ysize
 
     cuMemcpyDtoD(img_org, img, len);
 
+	CUfunction kernel = ocl.kernel[KERNEL_AVERAGE5X5];
     const void *args[] = { &img, &img_org };
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_AVERAGE5X5],
+    CUresult err = cuLaunchKernel(kernel,
         xsize, ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 
     cuMemFree(img_org);
@@ -548,20 +560,21 @@ void cuMinSquareValEx(
 {
     ocu_args_d_t &ocl = getOcu();
 
-    cu_mem srcA = ocl.allocMem(sizeof(float) * xsize * ysize);
+    cu_mem result = ocl.allocMem(sizeof(float) * xsize * ysize);
 
-    const void *args[] = { &srcA, &img, &square_size, &offset };
+	CUfunction kernel = ocl.kernel[KERNEL_MINSQUAREVAL];
+    const void *args[] = { &result, &img, &square_size, &offset };
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_MINSQUAREVAL],
+    CUresult err = cuLaunchKernel(kernel,
         xsize, ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
-    cuMemcpyDtoD(img, srcA, sizeof(float) * xsize * ysize);
-    cuMemFree(srcA);
+    cuMemcpyDtoD(img, result, sizeof(float) * xsize * ysize);
+    cuMemFree(result);
 }
 
 static void MakeMask(double extmul, double extoff,
@@ -664,18 +677,19 @@ void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, siz
     ocu_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b);
     ocu_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb);
 
+	CUfunction kernel = ocl.kernel[KERNEL_DOMASK];
     const void *args[] = { &mask.r, &mask.g, &mask.b,
         &mask_dc.r, &mask_dc.g, &mask_dc.b,
         &xyb.x, &xyb.y, &xyb.b,
         &xyb_dc.x, &xyb_dc.y, &xyb_dc.b };
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_DOMASK],
+    CUresult err = cuLaunchKernel(kernel,
         xsize, ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 
     ocl.releaseMemChannels(xyb);
@@ -727,6 +741,7 @@ void cuCombineChannelsEx(
     const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step;
     const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step;
 
+	CUfunction kernel = ocl.kernel[KERNEL_COMBINECHANNELS];
     const void *args[] = { &result,
         &mask.r, &mask.g, &mask.b,
         &mask_dc.r, &mask_dc.g, &mask_dc.b,
@@ -736,13 +751,13 @@ void cuCombineChannelsEx(
         &res_xsize,
         &step };
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_COMBINECHANNELS],
+    CUresult err = cuLaunchKernel(kernel,
         work_xsize, work_ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -752,21 +767,19 @@ void cuUpsampleSquareRootEx(cu_mem diffmap, const size_t xsize, const size_t ysi
 
     cu_mem diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float));
 
-    const void *args[] = { &diffmap_out,
-        &diffmap,
-        &xsize, &ysize,
-        &step };
+	CUfunction kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT];
+    const void *args[] = { &diffmap_out, &diffmap, &xsize, &ysize, &step };
 
     const size_t res_xsize = (xsize + step - 1) / step;
     const size_t res_ysize = (ysize + step - 1) / step;
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_UPSAMPLESQUAREROOT],
+    CUresult err = cuLaunchKernel(kernel,
         res_xsize, res_ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
     cuMemcpyDtoD(diffmap, diffmap_out, xsize * ysize * sizeof(float));
 
@@ -780,19 +793,16 @@ void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const siz
     int cls = 8 - step;
     int cls2 = (8 - step) / 2;
 
-    const void *args[] = { &out,
-        &in,
-        &xsize,
-        &cls,
-        &cls2 };
+	CUfunction kernel = ocl.kernel[KERNEL_REMOVEBORDER];
+    const void *args[] = { &out, &in, &xsize, &cls, &cls2 };
 
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_REMOVEBORDER],
+    CUresult err = cuLaunchKernel(kernel,
         xsize - cls, ysize - cls, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -802,19 +812,16 @@ void cuAddBorderEx(cu_mem out, size_t xsize, size_t ysize, int step, cu_mem in)
 
     int cls = 8 - step;
     int cls2 = (8 - step) / 2;
+	CUfunction kernel = ocl.kernel[KERNEL_ADDBORDER];
+    const void *args[] = { &out, &cls, &cls2, &in };
 
-    const void *args[] = { &out,
-        &cls,
-        &cls2,
-        &in };
-
-    CUresult err = cuLaunchKernel(ocl.kernel[KERNEL_ADDBORDER],
+    CUresult err = cuLaunchKernel(kernel,
         xsize, ysize, 1,
         1, 1, 1,
         0,
-        ocl.stream, (void**)args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.stream);
+    err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index d15733c9..8923bcd4 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -13,12 +13,14 @@ ocu_args_d_t& getOcu(void)
 
     bInit = true;
 
-    CUresult r = cuInit(0);
+    CUresult err = cuInit(0);
+    LOG_CU_RESULT(err);
     CUdevice dev = 0;
     CUcontext ctxt;
     CUstream  stream;
 
-    r = cuCtxCreate(&ctxt, CU_CTX_SCHED_BLOCKING_SYNC, dev);
+    err = cuCtxCreate(&ctxt, CU_CTX_SCHED_BLOCKING_SYNC, dev);
+    LOG_CU_RESULT(err);
 
     char name[1024];
     int proc_count = 0;
@@ -30,7 +32,7 @@ ocu_args_d_t& getOcu(void)
     cuDeviceGetAttribute(&proc_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
     cuDeviceGetAttribute(&thread_count, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
     LogError("CUDA Adapter:%s Ver%d.%d MP %d Core %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count);
-
+/*
     char* source = nullptr;
     size_t src_size = 0;
     ReadSourceFromFile("clguetzli/clguetzli.cl", &source, &src_size);
@@ -38,7 +40,7 @@ ocu_args_d_t& getOcu(void)
     nvrtcProgram prog;
     const char *opts[] = { "-arch=compute_30", "-default-device", "-G", "-I\"./\"", "--fmad=false" };
     nvrtcCreateProgram(&prog, source, "clguetzli.cl", 0, NULL, NULL);
-    nvrtcResult compile_result = nvrtcCompileProgram(prog, 3, opts);
+    nvrtcResult compile_result;// = nvrtcCompileProgram(prog, 3, opts);
     if (NVRTC_SUCCESS != compile_result)
     {
         // Obtain compilation log from the program.
@@ -52,41 +54,51 @@ ocu_args_d_t& getOcu(void)
         delete[] log;
     }
 
+    delete[] source;
     // Obtain PTX from the program.
     size_t ptxSize = 0;
     nvrtcGetPTXSize(prog, &ptxSize);
     char *ptx = new char[ptxSize];
     nvrtcGetPTX(prog, ptx);
+*/
+
+    char* ptx = nullptr;
+    size_t src_size = 0;
+#ifdef _WIN64
+    ReadSourceFromFile("clguetzli/clguetzli.cu.ptx64", &ptx, &src_size);
+#else
+    ReadSourceFromFile("clguetzli/clguetzli.cu.ptx32", &ptx, &src_size);
+#endif
 
     CUmodule mod;
     CUjit_option jit_options[2];
     void *jit_optvals[2];
     jit_options[0] = CU_JIT_CACHE_MODE;
     jit_optvals[0] = (void*)(uintptr_t)CU_JIT_CACHE_OPTION_CA;
-    r = cuModuleLoadDataEx(&mod, ptx, 1, jit_options, jit_optvals);
+    err = cuModuleLoadDataEx(&mod, ptx, 1, jit_options, jit_optvals);
+    LOG_CU_RESULT(err);
 
-    delete[] source;
     delete[] ptx;
 
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTION], mod, "clConvolutionEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONX], mod, "clConvolutionXEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONY], mod, "clConvolutionYEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_SQUARESAMPLE], mod, "clSquareSampleEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE], mod, "clOpsinDynamicsImageEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_MASKHIGHINTENSITYCHANGE], mod, "clMaskHighIntensityChangeEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTOR], mod, "clEdgeDetectorMapEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_BLOCKDIFFMAP], mod, "clBlockDiffMapEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTORLOWFREQ], mod, "clEdgeDetectorLowFreqEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_DIFFPRECOMPUTE], mod, "clDiffPrecomputeEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_SCALEIMAGE], mod, "clScaleImageEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_AVERAGE5X5], mod, "clAverage5x5Ex");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_MINSQUAREVAL], mod, "clMinSquareValEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_DOMASK], mod, "clDoMaskEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_COMBINECHANNELS], mod, "clCombineChannelsEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_UPSAMPLESQUAREROOT], mod, "clUpsampleSquareRootEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_REMOVEBORDER], mod, "clRemoveBorderEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_ADDBORDER], mod, "clAddBorderEx");
-    r = cuModuleGetFunction(&ocu.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER], mod, "clComputeBlockZeroingOrderEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTION], mod, "clConvolutionEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONX], mod, "clConvolutionXEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_CONVOLUTIONY], mod, "clConvolutionYEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_SQUARESAMPLE], mod, "clSquareSampleEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE], mod, "clOpsinDynamicsImageEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_MASKHIGHINTENSITYCHANGE], mod, "clMaskHighIntensityChangeEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTOR], mod, "clEdgeDetectorMapEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_BLOCKDIFFMAP], mod, "clBlockDiffMapEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_EDGEDETECTORLOWFREQ], mod, "clEdgeDetectorLowFreqEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_DIFFPRECOMPUTE], mod, "clDiffPrecomputeEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_SCALEIMAGE], mod, "clScaleImageEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_AVERAGE5X5], mod, "clAverage5x5Ex");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_MINSQUAREVAL], mod, "clMinSquareValEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_DOMASK], mod, "clDoMaskEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_COMBINECHANNELS], mod, "clCombineChannelsEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_UPSAMPLESQUAREROOT], mod, "clUpsampleSquareRootEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_REMOVEBORDER], mod, "clRemoveBorderEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_ADDBORDER], mod, "clAddBorderEx");
+    cuModuleGetFunction(&ocu.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER], mod, "clComputeBlockZeroingOrderEx");
 
     cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED);
     cuCtxSetSharedMemConfig(CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE);
@@ -94,7 +106,7 @@ ocu_args_d_t& getOcu(void)
     cuStreamCreate(&stream, 0);
 
     ocu.dev = dev;
-    ocu.stream = stream;
+    ocu.commandQueue = stream;
     ocu.mod = mod;
     ocu.ctxt = ctxt;
 
@@ -102,6 +114,10 @@ ocu_args_d_t& getOcu(void)
 }
 
 ocu_args_d_t::ocu_args_d_t()
+    : dev(0)
+    , commandQueue(NULL)
+    , mod(NULL)
+    , ctxt(NULL)
 {
 
 }
@@ -110,7 +126,7 @@ ocu_args_d_t::~ocu_args_d_t()
 {
     cuModuleUnload(mod);
     cuCtxDestroy(ctxt);
-//    cuStreamDestroy(stream);
+//    cuStreamDestroy(commandQueue);
 }
 
 cu_mem ocu_args_d_t::allocMem(size_t s, const void *init)
@@ -119,11 +135,11 @@ cu_mem ocu_args_d_t::allocMem(size_t s, const void *init)
     cuMemAlloc(&mem, s);
     if (init)
     {
-        cuMemcpyHtoD(mem, init, s);
+        cuMemcpyHtoDAsync(mem, init, s, commandQueue);
     }
     else
     {
-        cuMemsetD8(mem, 0, s);
+        cuMemsetD8Async(mem, 0, s, commandQueue);
     }
 
     return mem;
@@ -153,6 +169,68 @@ void ocu_args_d_t::releaseMemChannels(ocu_channels &rgb)
 
 const char* TranslateCUDAError(CUresult errorCode)
 {
-    return "Unknwon";
+    switch (errorCode)
+    {
+    case CUDA_SUCCESS: return "CUDA_SUCCESS";
+    case CUDA_ERROR_INVALID_VALUE: return "CUDA_ERROR_INVALID_VALUE";
+    case CUDA_ERROR_OUT_OF_MEMORY: return "CUDA_ERROR_OUT_OF_MEMORY";
+    case CUDA_ERROR_NOT_INITIALIZED: return "CUDA_ERROR_NOT_INITIALIZED";
+    case CUDA_ERROR_DEINITIALIZED: return "CUDA_ERROR_DEINITIALIZED";
+    case CUDA_ERROR_PROFILER_DISABLED: return "CUDA_ERROR_PROFILER_DISABLED";
+    case CUDA_ERROR_PROFILER_NOT_INITIALIZED: return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
+    case CUDA_ERROR_PROFILER_ALREADY_STARTED: return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
+    case CUDA_ERROR_PROFILER_ALREADY_STOPPED: return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
+    case CUDA_ERROR_NO_DEVICE: return "CUDA_ERROR_NO_DEVICE";
+    case CUDA_ERROR_INVALID_DEVICE: return "CUDA_ERROR_INVALID_DEVICE";
+    case CUDA_ERROR_INVALID_IMAGE: return "CUDA_ERROR_INVALID_IMAGE";
+    case CUDA_ERROR_INVALID_CONTEXT: return "CUDA_ERROR_INVALID_CONTEXT";
+    case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
+    case CUDA_ERROR_MAP_FAILED: return "CUDA_ERROR_MAP_FAILED";
+    case CUDA_ERROR_UNMAP_FAILED: return "CUDA_ERROR_UNMAP_FAILED";
+    case CUDA_ERROR_ARRAY_IS_MAPPED: return "CUDA_ERROR_ARRAY_IS_MAPPED";
+    case CUDA_ERROR_ALREADY_MAPPED: return "CUDA_ERROR_ALREADY_MAPPED";
+    case CUDA_ERROR_NO_BINARY_FOR_GPU: return "CUDA_ERROR_NO_BINARY_FOR_GPU";
+    case CUDA_ERROR_ALREADY_ACQUIRED: return "CUDA_ERROR_ALREADY_ACQUIRED";
+    case CUDA_ERROR_NOT_MAPPED: return "CUDA_ERROR_NOT_MAPPED";
+    case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
+    case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
+    case CUDA_ERROR_ECC_UNCORRECTABLE: return "CUDA_ERROR_ECC_UNCORRECTABLE";
+    case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUDA_ERROR_UNSUPPORTED_LIMIT";
+    case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
+    case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";
+    case CUDA_ERROR_INVALID_PTX: return "CUDA_ERROR_INVALID_PTX";
+    case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT";
+    case CUDA_ERROR_NVLINK_UNCORRECTABLE: return "CUDA_ERROR_NVLINK_UNCORRECTABLE";
+    case CUDA_ERROR_INVALID_SOURCE: return "CUDA_ERROR_INVALID_SOURCE";
+    case CUDA_ERROR_FILE_NOT_FOUND: return "CUDA_ERROR_FILE_NOT_FOUND";
+    case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
+    case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
+    case CUDA_ERROR_OPERATING_SYSTEM: return "CUDA_ERROR_OPERATING_SYSTEM";
+    case CUDA_ERROR_INVALID_HANDLE: return "CUDA_ERROR_INVALID_HANDLE";
+    case CUDA_ERROR_NOT_FOUND: return "CUDA_ERROR_NOT_FOUND";
+    case CUDA_ERROR_NOT_READY: return "CUDA_ERROR_NOT_READY";
+    case CUDA_ERROR_ILLEGAL_ADDRESS: return "CUDA_ERROR_ILLEGAL_ADDRESS";
+    case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
+    case CUDA_ERROR_LAUNCH_TIMEOUT: return "CUDA_ERROR_LAUNCH_TIMEOUT";
+    case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
+    case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
+    case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
+    case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
+    case CUDA_ERROR_CONTEXT_IS_DESTROYED: return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
+    case CUDA_ERROR_ASSERT: return "CUDA_ERROR_ASSERT";
+    case CUDA_ERROR_TOO_MANY_PEERS: return "CUDA_ERROR_TOO_MANY_PEERS";
+    case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
+    case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
+    case CUDA_ERROR_HARDWARE_STACK_ERROR: return "CUDA_ERROR_HARDWARE_STACK_ERROR";
+    case CUDA_ERROR_ILLEGAL_INSTRUCTION: return "CUDA_ERROR_ILLEGAL_INSTRUCTION";
+    case CUDA_ERROR_MISALIGNED_ADDRESS: return "CUDA_ERROR_MISALIGNED_ADDRESS";
+    case CUDA_ERROR_INVALID_ADDRESS_SPACE: return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
+    case CUDA_ERROR_INVALID_PC: return "CUDA_ERROR_INVALID_PC";
+    case CUDA_ERROR_LAUNCH_FAILED: return "CUDA_ERROR_LAUNCH_FAILED";
+    case CUDA_ERROR_NOT_PERMITTED: return "CUDA_ERROR_NOT_PERMITTED";
+    case CUDA_ERROR_NOT_SUPPORTED: return "CUDA_ERROR_NOT_SUPPORTED";
+    case CUDA_ERROR_UNKNOWN: return "CUDA_ERROR_UNKNOWN";
+    default: return "CUDA_ERROR_UNKNOWN";
+    }
 }
 #endif
\ No newline at end of file
diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h
index 426019cc..dbc42916 100644
--- a/clguetzli/ocu.h
+++ b/clguetzli/ocu.h
@@ -23,7 +23,7 @@ struct ocu_args_d_t
     void releaseMemChannels(ocu_channels &rgb);
 
     CUfunction  kernel[KERNEL_COUNT];
-    CUstream    stream;
+    CUstream    commandQueue;
     CUmodule    mod;
     CUcontext   ctxt;
     CUdevice    dev;
diff --git a/compile.bat b/compile.bat
index 05a3a361..156ee639 100644
--- a/compile.bat
+++ b/compile.bat
@@ -1,4 +1,7 @@
 @rem setupt windows var
 call vcvars64.bat
 
-nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --fmad=false --machine 64 -G -g -ptx -o clguetzli\clguetzli.cu.ptx64 clguetzli\clguetzli.cu
\ No newline at end of file
+@echo %1 --machine 64 or 32
+@echo %2  -G 
+
+nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --fmad=false --machine %1 %2 -ptx -o clguetzli\clguetzli.cu.ptx%1  clguetzli\clguetzli.cu
\ No newline at end of file
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index fc36b9a0..c8936a47 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -108,13 +108,13 @@
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <MinimalRebuild>false</MinimalRebuild>
       <StringPooling>true</StringPooling>
-      <PreprocessorDefinitions>ENABLE_OPENCL;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>__USE_CUDA__;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64</AdditionalLibraryDirectories>
     </Link>
@@ -174,7 +174,7 @@
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64</AdditionalLibraryDirectories>
     </Link>
@@ -396,22 +396,24 @@
       <FileType>Document</FileType>
       <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
       </Command>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
     </Intel_OpenCL_Build_Rules>
     <CustomBuild Include="clguetzli\clguetzli.cu">
       <FileType>Document</FileType>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
       <Message Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CUDA Code Builder</Message>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)compile.bat</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compile.bat</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)compile.bat 64 -G</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(ProjectDir)compile.bat 32 -G</Command>
       <LinkObjects Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkObjects>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cu.ptx</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ProjectDir)compile.bat</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ProjectDir)compile.bat 64 -G</Command>
       <Message Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CUDA Code Builder</Message>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cu.ptx</Outputs>
       <LinkObjects Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</LinkObjects>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ProjectDir)compile.bat 32 -G</Command>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CUDA Code Builder</Message>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cu.ptx</Outputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CUDA Code Builder</Message>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cu.ptx</Outputs>
     </CustomBuild>
     <None Include="third_party\libpng\pngwin.def" />
     <None Include="third_party\zlib\inffas32.asm" />

From cd2e614be6c963e028ab5413986e7f6adc4e8520 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sun, 4 Jun 2017 23:48:25 +0800
Subject: [PATCH 133/189] =?UTF-8?q?=E6=9B=B4=E6=8D=A2mode=E6=96=B9?=
 =?UTF-8?q?=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clbutter_comparator.cpp | 158 ++++++++++++++----------------
 clguetzli/clguetzli.cpp           |   4 +-
 clguetzli/clguetzli.h             |  13 ++-
 guetzli.vcxproj                   |   4 +-
 guetzli/guetzli.cc                |  15 ++-
 guetzli/processor.cc              |  14 +--
 6 files changed, 103 insertions(+), 105 deletions(-)

diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index 178e70e9..6e20976a 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -14,18 +14,20 @@ namespace butteraugli
         std::vector<std::vector<float>> &xyb1,
         std::vector<float> &result)
     {
-        if (g_useOpenCL && xsize_ > 100 && ysize_ > 100)
+        if (MODE_OPENCL == g_mathMode && xsize_ > 100 && ysize_ > 100)
         {
             result.resize(xsize_ * ysize_);
             clDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_);
         }
-        else if (g_useCuda && xsize_ > 100 && ysize_ > 100)
+#ifdef __USE_CUDA__
+        else if (MODE_CUDA == g_mathMode && xsize_ > 100 && ysize_ > 100)
         {
             result.resize(xsize_ * ysize_);
-            clDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+            cuDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_);
         }
+#endif
         else
         {
             ButteraugliComparator::DiffmapOpsinDynamicsImage(xyb0, xyb1, result);
@@ -39,7 +41,7 @@ namespace butteraugli
     {
         ButteraugliComparator::BlockDiffMap(xyb0, xyb1, block_diff_dc, block_diff_ac);
 
-        if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8)
+        if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8)
         {
             tclBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
@@ -55,7 +57,7 @@ namespace butteraugli
     {
         ButteraugliComparator::EdgeDetectorMap(xyb0, xyb1, edge_detector_map);
 
-        if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8)
+        if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8)
         {
             tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
@@ -68,21 +70,19 @@ namespace butteraugli
         const std::vector<std::vector<float> > &xyb1,
         std::vector<float>* block_diff_ac)
     {
-        std::vector<float> orign_ac;
-        if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8)
-        {
-            orign_ac = *block_diff_ac;
-        }
-
-        ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac);
-
-        if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8)
+        if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8)
         {
+            std::vector<float> orign_ac = *block_diff_ac;
+            ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac);
             tclEdgeDetectorLowFreq(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
                 xsize_, ysize_, step_,
                 orign_ac.data(), (*block_diff_ac).data());
         }
+        else
+        {
+            ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac);
+        }
     }
 
     void clButteraugliComparator::CombineChannels(const std::vector<std::vector<float> >& mask_xyb,
@@ -92,55 +92,49 @@ namespace butteraugli
         const std::vector<float>& edge_detector_map,
         std::vector<float>* result)
     {
-        std::vector<float> temp;
-        if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8)
-        {
-            temp = *result;
-        }
-
-        ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result);
-
-        if (g_checkOpenCL && xsize_ > 8 && ysize_ > 8)
+        if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8)
         {
+            std::vector<float> temp = *result;
 			temp.resize(res_xsize_ * res_ysize_);
+            ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result);
             tclCombineChannels(mask_xyb[0].data(), mask_xyb[1].data(), mask_xyb[2].data(),
                 mask_xyb_dc[0].data(), mask_xyb_dc[1].data(), mask_xyb_dc[2].data(),
                 block_diff_dc.data(),
                 block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, &temp[0], &(*result)[0]);
         }
+        else
+        {
+            ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result);
+        }
     }
 
     void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) 
     {
-        std::vector<float> img;
-        if (g_checkOpenCL && xsize > 8 && ysize > 8)
+        if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
+            std::vector<float> img;
             img.resize(xsize * ysize);
             memcpy(img.data(), values, xsize * ysize * sizeof(float));
+            _MinSquareVal(square_size, offset, xsize, ysize, values);
+            tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values);
         }
-
-        _MinSquareVal(square_size, offset, xsize, ysize, values);
-
-
-        if (g_checkOpenCL && xsize > 8 && ysize > 8)
+        else
         {
-            tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values);
+            _MinSquareVal(square_size, offset, xsize, ysize, values);
         }
     }
 
     void Average5x5(int xsize, int ysize, std::vector<float>* diffs)
     {
-        std::vector<float> diffs_org;
-        if (g_checkOpenCL && xsize > 8 && ysize > 8)
+        if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
-            diffs_org = *diffs;
+            std::vector<float> diffs_org = *diffs;
+            _Average5x5(xsize, ysize, diffs);
+            tclAverage5x5(xsize, ysize, diffs_org, *diffs);
         }
-
-        _Average5x5(xsize, ysize, diffs);
-
-        if (g_checkOpenCL && xsize > 8 && ysize > 8)
+        else
         {
-            tclAverage5x5(xsize, ysize, diffs_org, *diffs);
+            _Average5x5(xsize, ysize, diffs);
         }
     }
 
@@ -148,7 +142,7 @@ namespace butteraugli
     {
         _DiffPrecompute(xyb0, xyb1, xsize, ysize, mask);
 
-        if (g_checkOpenCL && xsize > 8 && ysize > 8)
+        if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
             tclDiffPrecompute(xyb0, xyb1, xsize, ysize, mask);
         }
@@ -160,7 +154,7 @@ namespace butteraugli
         std::vector<std::vector<float> > *mask,
         std::vector<std::vector<float> > *mask_dc)
     {
-        if (g_useOpenCL && xsize > 100 && ysize > 100)
+        if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100)
         {
             mask->resize(3);
             mask_dc->resize(3);
@@ -175,10 +169,9 @@ namespace butteraugli
                 xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data()
                 );
-            return;
         }
 #ifdef __USE_CUDA__
-        else if (g_useCuda && xsize > 100 && ysize > 100)
+        else if (MODE_CUDA == g_mathMode && xsize > 100 && ysize > 100)
         {
             mask->resize(3);
             mask_dc->resize(3);
@@ -193,36 +186,36 @@ namespace butteraugli
                 xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data()
             );
-            return;
         }
 #endif
-        _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc);
-
-        if (g_checkOpenCL && xsize > 8 && ysize > 8)
+        else if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
+            _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc);
             tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
                 xsize, ysize,
                 (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(),
                 (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data());
         }
+        else
+        {
+            _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc);
+        }
     }
 
     void CalculateDiffmap(const size_t xsize, const size_t ysize,
         const size_t step,
         std::vector<float>* diffmap)
     {
-        std::vector<float> diffmap_org;
-        if (g_checkOpenCL && xsize > 8 && ysize > 8)
+        if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
-            diffmap_org = *diffmap;
+            std::vector<float> diffmap_org = *diffmap;
+            _CalculateDiffmap(xsize, ysize, step, diffmap);
+            tclCalculateDiffmap(xsize, ysize, step, diffmap_org.data(), diffmap_org.size(), (*diffmap).data());
         }
-
-        _CalculateDiffmap(xsize, ysize, step, diffmap);
-
-        if (g_checkOpenCL && xsize > 8 && ysize > 8)
+        else
         {
-            tclCalculateDiffmap(xsize, ysize, step, diffmap_org.data(), diffmap_org.size(), (*diffmap).data());
+            _CalculateDiffmap(xsize, ysize, step, diffmap);
         }
     }
 
@@ -235,7 +228,7 @@ namespace butteraugli
     {
         _MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1);
 
-        if (g_checkOpenCL && xsize > 8 && ysize > 8)
+        if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
             tclMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(),
                 c1[0].data(), c1[1].data(), c1[2].data(),
@@ -247,17 +240,15 @@ namespace butteraugli
 
     void ScaleImage(double scale, std::vector<float> *result)
     {
-        std::vector<float> result_org;
-        if (g_checkOpenCL && result->size() > 64)
+        if (MODE_CHECKCL == g_mathMode && result->size() > 64)
         {
-            result_org = *result;
+            std::vector<float> result_org = *result;
+            _ScaleImage(scale, result);
+            tclScaleImage(scale, result_org.data(), (*result).data(), (*result).size());
         }
-
-        _ScaleImage(scale, result);
-
-        if (g_checkOpenCL && result->size() > 64)
+        else
         {
-            tclScaleImage(scale, result_org.data(), (*result).data(), (*result).size());
+            _ScaleImage(scale, result);
         }
     }
 
@@ -271,7 +262,7 @@ namespace butteraugli
     {
         _Convolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
 
-        if (g_checkOpenCL && xsize > 8 && ysize > 8)
+        if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
             tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
         }
@@ -280,25 +271,24 @@ namespace butteraugli
     void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
         double border_ratio)
     {
-        std::vector<float> orignChannel;
-        if (g_checkOpenCL && xsize > 8 && ysize > 8)
+        if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
+            std::vector<float> orignChannel;
             orignChannel.resize(xsize * ysize);
             memcpy(orignChannel.data(), channel, xsize * ysize * sizeof(float));
+            _Blur(xsize, ysize, channel, sigma, border_ratio);
+            tclBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel);
         }
-
-        _Blur(xsize, ysize, channel, sigma, border_ratio);
-
-        if (g_checkOpenCL && xsize > 8 && ysize > 8)
+        else
         {
-            tclBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel);
+            _Blur(xsize, ysize, channel, sigma, border_ratio);
         }
     }
 
     void OpsinDynamicsImage(size_t xsize, size_t ysize,
         std::vector<std::vector<float> > &rgb)
     {
-        if (g_useOpenCL && xsize > 100 && ysize > 100)
+        if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100)
         {
             float * r = rgb[0].data();
             float * g = rgb[1].data();
@@ -307,7 +297,7 @@ namespace butteraugli
             clOpsinDynamicsImage(r, g, b, xsize, ysize);
         }
 #ifdef __USE_CUDA__
-        else if (g_useCuda && xsize > 100 && ysize > 100)
+        else if (MODE_CUDA == g_mathMode && xsize > 100 && ysize > 100)
         {
             float * r = rgb[0].data();
             float * g = rgb[1].data();
@@ -316,21 +306,17 @@ namespace butteraugli
             cuOpsinDynamicsImage(r, g, b, xsize, ysize);
         }
 #endif
-        else
+        else if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
-            std::vector< std::vector<float>> orig_rgb;
-            if (g_checkOpenCL && xsize > 8 && ysize > 8)
-            {
-                orig_rgb = rgb;
-            }
-
+            std::vector< std::vector<float>> orig_rgb = rgb;
             _OpsinDynamicsImage(xsize, ysize, rgb);
-
-            if (g_checkOpenCL && xsize > 8 && ysize > 8)
-            {
-                tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), xsize, ysize,
+            tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), 
+                    xsize, ysize,
                     rgb[0].data(), rgb[1].data(), rgb[2].data());
-            }
         }  
+        else
+        {
+            _OpsinDynamicsImage(xsize, ysize, rgb);
+        }
     }
 }
\ No newline at end of file
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 4495e935..1870a638 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -4,9 +4,7 @@
 #include <vector>
 #include "cl.hpp"
 
-extern bool g_useOpenCL = false;
-extern bool g_useCuda = false;
-extern bool g_checkOpenCL = false;
+MATH_MODE g_mathMode = MODE_CPU; // definition; the extern declaration lives in clguetzli.h
 
 void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize)
 {
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index ccdf24a8..cad4ef6e 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -7,9 +7,16 @@
 
 #include "cuguetzli.h"
 
-extern bool g_useOpenCL;
-extern bool g_useCuda;
-extern bool g_checkOpenCL;
+enum MATH_MODE
+{
+    MODE_CPU = 0,
+    MODE_OPENCL,
+    MODE_CUDA,
+    MODE_CHECKCL,
+    MODE_CHECKCUDA
+};
+
+extern MATH_MODE g_mathMode;
 
 void clOpsinDynamicsImage(
     float *r, float *g, float *b, 
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index c8936a47..6c41c349 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -108,7 +108,7 @@
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <MinimalRebuild>false</MinimalRebuild>
       <StringPooling>true</StringPooling>
-      <PreprocessorDefinitions>__USE_CUDA__;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
@@ -192,7 +192,7 @@
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>nvrtc.lib;cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
       <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32</AdditionalLibraryDirectories>
diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc
index 587c06d4..276cb9d6 100644
--- a/guetzli/guetzli.cc
+++ b/guetzli/guetzli.cc
@@ -227,6 +227,8 @@ void Usage() {
       "                 the limit. Default limit is %d MB.\n"
 	  "  --opencl     - Use OpenCL\n"
 	  "  --cuda       - Use CUDA\n"	 
+      "  --checkcl    - Check OpenCL result\n"
+      "  --checkcuda  - Check CUDA result\n"
       "  --nomemlimit - Do not limit memory usage.\n", kDefaultJPEGQuality, kDefaultMemlimitMB);
   exit(1);
 }
@@ -260,14 +262,17 @@ int main(int argc, char** argv) {
       memlimit_mb = -1;
 	}
 	else if (!strcmp(argv[opt_idx], "--opencl")) {
-		g_useOpenCL = true;
+		g_mathMode = MODE_OPENCL;
 	}
 	else if (!strcmp(argv[opt_idx], "--cuda")) {
-		g_useCuda = true;
-	}
-	else if (!strcmp(argv[opt_idx], "--checkcl")) {
-		g_checkOpenCL = true;
+		g_mathMode = MODE_CUDA;
 	}
+    else if (!strcmp(argv[opt_idx], "--checkcl")) {
+        g_mathMode = MODE_CHECKCL;
+    }
+    else if (!strcmp(argv[opt_idx], "--checkcuda")) {
+        g_mathMode = MODE_CHECKCUDA;
+    }
 	else if (!strcmp(argv[opt_idx], "--")) {
       opt_idx++;
       break;
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 63ebb609..59b5dff6 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -567,7 +567,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
     CoeffData * output_order = NULL;
     ButteraugliComparatorEx * comp = (ButteraugliComparatorEx*)comparator_;
 
-    if (g_useOpenCL || g_checkOpenCL)
+    if (MODE_OPENCL == g_mathMode || MODE_CUDA == g_mathMode || MODE_CHECKCL == g_mathMode) // CHECKCL needs the GPU result to compare against
     {
         channel_info orig_channel[3];
         channel_info mayout_channel[3];
@@ -588,7 +588,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
         output_order_gpu.resize(num_blocks * kBlockSize);
         output_order = output_order_gpu.data();
 
-        if (g_useCuda)
+        if (MODE_CUDA != g_mathMode) // OpenCL kernel for MODE_OPENCL and MODE_CHECKCL
         {
             clComputeBlockZeroingOrder(output_order,
                 orig_channel,
@@ -601,9 +601,10 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
                 comp_mask,
                 comp->BlockErrorLimit());
         }
+#ifdef __USE_CUDA__
         else
         {
-            clComputeBlockZeroingOrder(output_order,
+            cuComputeBlockZeroingOrder(output_order,
                 orig_channel,
                 comp->imgOpsinDynamicsBlockList.data(),
                 comp->imgMaskXyzScaleBlockList.data(),
@@ -614,9 +615,10 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
                 comp_mask,
                 comp->BlockErrorLimit());
         }
-
+#endif
     }
-    if (!g_useOpenCL || g_checkOpenCL)
+
+    if (MODE_OPENCL != g_mathMode && MODE_CUDA != g_mathMode) // CPU path for MODE_CPU and both check modes, so output_order is never left NULL
     {
         output_order_cpu.resize(num_blocks * kBlockSize);
         output_order = output_order_cpu.data();
@@ -651,7 +653,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
         }
     }
 
-    if (g_checkOpenCL)
+    if (MODE_CHECKCL == g_mathMode)
     {
         int count = 0;
         int check_size = output_order_gpu.size();

From 598603b0217a5701584c0238f4a84d19666ac38a Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 5 Jun 2017 00:00:21 +0800
Subject: [PATCH 134/189] =?UTF-8?q?=E5=BC=82=E6=AD=A5=E6=8B=B7=E8=B4=9D?=
 =?UTF-8?q?=E5=86=85=E5=AD=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/cuguetzli.cpp | 8 ++++----
 guetzli.vcxproj         | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
index 0a464a77..c4ddc6c7 100644
--- a/clguetzli/cuguetzli.cpp
+++ b/clguetzli/cuguetzli.cpp
@@ -15,9 +15,10 @@ void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, cons
 
     cuOpsinDynamicsImageEx(rgb, xsize, ysize);
 
-    cuMemcpyDtoH(r, rgb.r, channel_size);
-    cuMemcpyDtoH(g, rgb.g, channel_size);
-	cuMemcpyDtoH(b, rgb.b, channel_size);
+    cuMemcpyDtoHAsync(r, rgb.r, channel_size, ocl.commandQueue);
+    cuMemcpyDtoHAsync(g, rgb.g, channel_size, ocl.commandQueue);
+	cuMemcpyDtoHAsync(b, rgb.b, channel_size, ocl.commandQueue);
+    cuFinish(ocl.commandQueue);
 
     ocl.releaseMemChannels(rgb);
 }
@@ -143,7 +144,6 @@ void cuComputeBlockZeroingOrder(
         cuMemFree(mem_orig_coeff[c]);
         cuMemFree(mem_mayout_coeff[c]);
         cuMemFree(mem_mayout_pixel[c]);
-
     }
 
     cuMemFree(mem_orig_image);
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 6c41c349..8bbeebc2 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -108,7 +108,7 @@
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <MinimalRebuild>false</MinimalRebuild>
       <StringPooling>true</StringPooling>
-      <PreprocessorDefinitions>_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>__USE_CUDA__;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>

From 8c29f1fba59e821b1dac29810ffba1b81159cd1e Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 5 Jun 2017 19:18:56 +0800
Subject: [PATCH 135/189] =?UTF-8?q?=E5=AE=8C=E6=88=90CUDA=E5=B9=B6?=
 =?UTF-8?q?=E8=A1=8C=E4=BC=98=E5=8C=96=EF=BC=8C=E8=AE=A1=E7=AE=97=E7=BB=93?=
 =?UTF-8?q?=E6=9E=9C=E6=AD=A3=E5=B8=B8=20=E7=9B=AE=E5=89=8D=E9=80=9F?=
 =?UTF-8?q?=E5=BA=A6=E6=AF=94opencl=E7=95=A5=E5=B7=AE=EF=BC=8C=E5=BE=85?=
 =?UTF-8?q?=E5=88=86=E6=9E=90=E4=BC=98=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl   | 90 ++++++++++++++++++++++++++--------------
 clguetzli/clguetzli.cl.h | 12 +++---
 clguetzli/clguetzli.cpp  | 33 +++++++++------
 clguetzli/cuguetzli.cpp  | 77 +++++++++++++++++++---------------
 compile.bat              |  2 +-
 guetzli.vcxproj          | 14 ++++---
 6 files changed, 141 insertions(+), 87 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 1e026fa9..8df441e9 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -106,6 +106,7 @@ __kernel void clConvolutionEx(
 
 __kernel void clConvolutionXEx(
 	__global float* result,
+    const int xsize, const int ysize,
 	__global const float* inp,
 	__global const float* multipliers, const int len, 
 	const int step, const int offset, const float border_ratio)
@@ -113,10 +114,12 @@ __kernel void clConvolutionXEx(
     const int x = get_global_id(0);
     const int y = get_global_id(1);
 
+    if (x >= xsize || y >= ysize) return;
+
     if (x % step != 0) return;
 
-    const int xsize = get_global_size(0);
-    const int ysize = get_global_size(1);
+//    const int xsize = get_global_size(0);
+//    const int ysize = get_global_size(1);
 
     float weight_no_border = 0;
     for (int j = 0; j <= 2 * offset; j++)
@@ -147,6 +150,7 @@ __kernel void clConvolutionXEx(
 
 __kernel void clConvolutionYEx(
 	__global float* result,
+    const int xsize, const int ysize,
 	__global const float* inp, 
 	__global const float* multipliers, const int len, 
     const int step, const int offset, const float border_ratio)
@@ -154,11 +158,12 @@ __kernel void clConvolutionYEx(
     const int x = get_global_id(0);
     const int y = get_global_id(1);
 
+    if (x >= xsize || y >= ysize) return;
     if (x % step != 0) return;
     if (y % step != 0) return;
 
-    const int xsize = get_global_size(0);
-    const int ysize = get_global_size(1);
+//    const int xsize = get_global_size(0);
+//    const int ysize = get_global_size(1);
 
     float weight_no_border = 0;
     for (int j = 0; j <= 2 * offset; j++)
@@ -189,28 +194,33 @@ __kernel void clConvolutionYEx(
 
 __kernel void clSquareSampleEx(
 	__global float* result,
+    const int xsize, const int ysize,
 	__global const float* image, 
 	const int xstep, const int ystep)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
+    if (x >= xsize || y >= ysize) return;
 
     int x_sample = x - x % xstep;
     int y_sample = y - y % ystep;
 
     if (x_sample == x && y_sample == y) return;
 
-    const int xsize = get_global_size(0);
-    const int ysize = get_global_size(1);
+//    const int xsize = get_global_size(0);
+//    const int ysize = get_global_size(1);
 
     result[y * xsize + x] = image[y_sample * xsize + x_sample];
 }
 
 __kernel void clOpsinDynamicsImageEx(
     __global float *r, __global float *g, __global float *b,
+    const int size,
     __global const float *r_blurred, __global const float *g_blurred, __global const float *b_blurred)
 {
     const int i = get_global_id(0);
+    if (i >= size) return;
+
     double pre[3] = { r_blurred[i], g_blurred[i],  b_blurred[i] };
     double pre_mixed[3];
     OpsinAbsorbance(pre, pre_mixed);
@@ -236,6 +246,7 @@ __kernel void clOpsinDynamicsImageEx(
 
 __kernel void clMaskHighIntensityChangeEx(
     __global float *xyb0_x, __global float *xyb0_y, __global float *xyb0_b,
+    const int xsize, const int ysize,
     __global float *xyb1_x, __global float *xyb1_y, __global float *xyb1_b,
     __global const float *c0_x, __global const float *c0_y, __global const float *c0_b,
     __global const float *c1_x, __global const float *c1_y, __global const float *c1_b
@@ -243,8 +254,9 @@ __kernel void clMaskHighIntensityChangeEx(
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
-    const int xsize = get_global_size(0);
-    const int ysize = get_global_size(1);
+    if (x >= xsize || y >= ysize) return;
+//    const int xsize = get_global_size(0);
+    //const int ysize = get_global_size(1);
 
     size_t ix = y * xsize + x;
     const double ave[3] = {
@@ -327,6 +339,7 @@ __kernel void clEdgeDetectorMapEx(
 
 __kernel void clBlockDiffMapEx(
 	__global float* block_diff_dc, __global float* block_diff_ac,
+    const int res_xsize, const int res_ysize,
 	__global const float* r, __global const float* g, __global const float* b,
     __global const float* r2, __global const float* g2, __global const float* b2,
     int xsize, int ysize, int step)
@@ -334,8 +347,10 @@ __kernel void clBlockDiffMapEx(
     const int res_x = get_global_id(0);
     const int res_y = get_global_id(1);
 
-    const int res_xsize = get_global_size(0);
-    const int res_ysize = get_global_size(1);
+//    const int res_xsize = get_global_size(0);
+//    const int res_ysize = get_global_size(1);
+
+    if (res_x >= res_xsize || res_y >= res_ysize) return;
 
     int pos_x = res_x * step;
     int pos_y = res_y * step;
@@ -450,13 +465,15 @@ __kernel void clEdgeDetectorLowFreqEx(
 
 __kernel void clDiffPrecomputeEx(
     __global float *mask_x, __global float *mask_y, __global float *mask_b,
+    const int xsize, const int ysize,
     __global const float *xyb0_x, __global const float *xyb0_y, __global const float *xyb0_b,
     __global const float *xyb1_x, __global const float *xyb1_y, __global const float *xyb1_b)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
-    const int xsize = get_global_size(0);
-    const int ysize = get_global_size(1);
+    if (x >= xsize || y >= ysize) return;
+//    const int xsize = get_global_size(0);
+    //const int ysize = get_global_size(1);
 
     double valsh0[3] = { 0.0 };
     double valsv0[3] = { 0.0 };
@@ -514,20 +531,23 @@ __kernel void clDiffPrecomputeEx(
     mask_b[ix] = (float)(m);
 }
 
-__kernel void clScaleImageEx(__global float *img, double scale)
+__kernel void clScaleImageEx(__global float *img, const int size, float scale)
 {
     const int i = get_global_id(0);
+    if (i >= size) return;
+
     img[i] *= scale;
 }
 
 #define Average5x5_w 0.679144890667f
 __constant float Average5x5_scale = 1.0f / (5.0f + 4 * Average5x5_w);
-__kernel void clAverage5x5Ex(__global float *img, __global const float *img_org)
+__kernel void clAverage5x5Ex(__global float *img, const int xsize, const int ysize, __global const float *img_org)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
-    const int xsize = get_global_size(0);
-    const int ysize = get_global_size(1);
+    if (x >= xsize || y >= ysize) return;
+//    const int xsize = get_global_size(0);
+//    const int ysize = get_global_size(1);
 	
     const int row0 = y * xsize;
 	if (x - 1 >= 0) {
@@ -562,31 +582,33 @@ __kernel void clAverage5x5Ex(__global float *img, __global const float *img_org)
 	img[row0 + x] *= Average5x5_scale;
 }
 
-__kernel void clMinSquareValEx(__global float* result, __global const float* img,  int square_size, int offset)
+__kernel void clMinSquareValEx(__global float* result, const int xsize, const int ysize, __global const float* img,  int square_size, int offset)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
-    const int width = get_global_size(0);
-    const int height = get_global_size(1);
+
+    if (x >= xsize || y >= ysize) return;
+//    const int width = get_global_size(0);
+//    const int height = get_global_size(1);
 
     int minH = offset > y ? 0 : y - offset;
-    int maxH = min(y + square_size - offset, height);
+    int maxH = min(y + square_size - offset, ysize);
 
     int minW = offset > x ? 0 : x - offset;
-    int maxW = min(x + square_size - offset, width);
+    int maxW = min(x + square_size - offset, xsize);
 
-    float minValue = img[minH * width + minW];
+    float minValue = img[minH * xsize + minW];
 
     for (int j = minH; j < maxH; j++)
     {
         for (int i = minW; i < maxW; i++)
         {
-            float tmp = img[j * width + i];
+            float tmp = img[j * xsize + i];
             if (tmp < minValue) minValue = tmp;
         }
     }
 
-    result[y * width + x] = minValue;
+    result[y * xsize + x] = minValue;
 }
 
 __kernel void clDoMaskEx(
@@ -723,6 +745,8 @@ __kernel void clComputeBlockZeroingOrderEx(
     __global const coeff_t *orig_batch_2,       // ԭʼͼ��ϵ��
     __global const float   *orig_image_batch,   // ԭʼͼ��pregamma
     __global const float   *mask_scale,         // ԭʼͼ���ĳ�����ز���
+    const int              block_xsize,
+    const int              block_ysize,
     const int              image_width,
     const int              image_height,
 
@@ -744,6 +768,8 @@ __kernel void clComputeBlockZeroingOrderEx(
     const int block_x = get_global_id(0);
     const int block_y = get_global_id(1);
 
+    if (block_x >= block_xsize || block_y >= block_ysize) return;
+
     channel_info orig_channel[3];
     orig_channel[0].coeff = orig_batch_0;
     orig_channel[1].coeff = orig_batch_1;
@@ -3151,16 +3177,16 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
         candidate_channel[c] = &candidate_block[c * 8 * 8];
     }
 
-    uchar yuv16x16[3 * 16 * 16] = { 0 };  // factor 2 mode output image
+//    uchar yuv16x16[3 * 16 * 16] = { 0 };  // factor 2 mode output image
     uchar yuv8x8[3 * 8 * 8] = { 0 };      // factor 1 mode output image
 
     for (int c = 0; c < 3; c++)
     {
-        if (mayout_channel[c].factor == 1) {
-            if (factor == 1) {
+//        if (mayout_channel[c].factor == 1) {
+  //          if (factor == 1) {
                 const coeff_t *coeff_block = candidate_channel[c];
                 CoeffToYUV8x8(coeff_block, &yuv8x8[c]);
-            }
+   /*         }
             else {
                 for (int iy = 0; iy < factor; ++iy) {
                     for (int ix = 0; ix < factor; ++ix) {
@@ -3182,7 +3208,8 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
                     }
                 }
             }
-        }
+*/
+ /*       }
         else { 
             if (factor == 1) {
                 int block_xx = block_x / mayout_channel[c].factor;
@@ -3209,9 +3236,10 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
                     image_height);
             }
         }
+*/
     }
 
-    if (factor == 1)
+  //  if (factor == 1)
     {
         float rgb0_c[3][kDCTBlockSize];
         int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, 0, 0);
@@ -3224,6 +3252,7 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
 
         return ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3);
     }
+/*
     else
     {
         int inside_x = block_x * 16 + 16 > image_width ? image_width - block_x * 16 : 16;
@@ -3255,5 +3284,6 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
         }
         return max_err;
     }
+*/
 }
 
diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h
index 6dec83c8..2a8ed044 100644
--- a/clguetzli/clguetzli.cl.h
+++ b/clguetzli/clguetzli.cl.h
@@ -117,9 +117,9 @@
     {
         switch (dim)
         {
-        case 0:  return blockIdx.x;
-        case 1:  return blockIdx.y;
-        default: return blockIdx.z;
+        case 0:  return blockIdx.x * blockDim.x + threadIdx.x;
+        case 1:  return blockIdx.y * blockDim.y + threadIdx.y;
+        default: return blockIdx.z * blockDim.z + threadIdx.z;
         }
     }
 
@@ -127,9 +127,9 @@
     {
         switch(dim)
         {
-        case 0: return gridDim.x;
-        case 1: return gridDim.y;
-        default: return gridDim.z;
+        case 0: return gridDim.x * blockDim.x;
+        case 1: return gridDim.y * blockDim.y;
+        default: return gridDim.z * blockDim.z;
         }
     }
 
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 1870a638..11cb16ed 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -119,6 +119,7 @@ void clComputeBlockZeroingOrder(
     cl_kernel kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER];
     clSetKernelArgEx(kernel, &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2],
                         &mem_orig_image, &mem_mask_scale, 
+						&blockf_width, &blockf_height,
                         &image_width, &image_height,
                         &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2],
                         &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2],
@@ -210,7 +211,7 @@ void clConvolutionXEx(
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONX];
-    clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio);
+    clSetKernelArgEx(kernel, &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -228,7 +229,7 @@ void clConvolutionYEx(
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_kernel kernel = ocl.kernel[KERNEL_CONVOLUTIONY];
-    clSetKernelArgEx(kernel, &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio);
+    clSetKernelArgEx(kernel, &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -245,7 +246,7 @@ void clSquareSampleEx(
 	ocl_args_d_t &ocl = getOcl();
 
 	cl_kernel kernel = ocl.kernel[KERNEL_SQUARESAMPLE];
-    clSetKernelArgEx(kernel, &result, &image, &xstep, &ystep);
+    clSetKernelArgEx(kernel, &result, &xsize, &ysize, &image, &xstep, &ystep);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -301,12 +302,14 @@ void clOpsinDynamicsImageEx(ocl_channels &rgb, const size_t xsize, const size_t
 	ocl_args_d_t &ocl = getOcl();
 	ocl_channels rgb_blurred = ocl.allocMemChannels(channel_size);
 
-	clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r);
-	clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g);
-	clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
+    const int size = xsize * ysize;
+
+    clBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r);
+    clBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g);
+    clBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
 
 	cl_kernel kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE];
-    clSetKernelArgEx(kernel,  &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b);
+    clSetKernelArgEx(kernel,  &rgb.r, &rgb.g, &rgb.b, &size, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b);
 
 	size_t globalWorkSize[1] = { xsize * ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -340,6 +343,7 @@ void clMaskHighIntensityChangeEx(
 	cl_kernel kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE];
     clSetKernelArgEx(kernel, 
 		&xyb0.r, &xyb0.g, &xyb0.b,
+		&xsize, &ysize,
     	&xyb1.r, &xyb1.g, &xyb1.b,
         &c0.r, &c0.g, &c0.b,
         &c1.r, &c1.g, &c1.b);
@@ -401,14 +405,17 @@ void clBlockDiffMapEx(
 {
 	ocl_args_d_t &ocl = getOcl();
 
+
+	const size_t res_xsize = (xsize + step - 1) / step;
+	const size_t res_ysize = (ysize + step - 1) / step;
+	
 	cl_kernel kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP];
     clSetKernelArgEx(kernel, &block_diff_dc, &block_diff_ac,
+		&res_xsize, &res_ysize,
         &rgb.r, &rgb.g, &rgb.b,
         &rgb2.r, &rgb2.g, &rgb2.b,
         &xsize, &ysize, &step);
 
-	const size_t res_xsize = (xsize + step - 1) / step;
-	const size_t res_ysize = (ysize + step - 1) / step;
 
 	size_t globalWorkSize[2] = { res_xsize, res_ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -463,6 +470,7 @@ void clDiffPrecomputeEx(
 
 	cl_kernel kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE];
     clSetKernelArgEx(kernel, &mask.x, &mask.y, &mask.b, 
+							&xsize, &ysize,
                             &xyb0.x, &xyb0.y, &xyb0.b,
 							&xyb1.x, &xyb1.y, &xyb1.b);
 
@@ -476,9 +484,10 @@ void clDiffPrecomputeEx(
 void clScaleImageEx(cl_mem img/*in, out*/, size_t size, double w)
 {
 	ocl_args_d_t &ocl = getOcl();
+    float fw = w;
 
 	cl_kernel kernel = ocl.kernel[KERNEL_SCALEIMAGE];
-	clSetKernelArgEx(kernel, &img, &w);
+	clSetKernelArgEx(kernel, &img, &size, &fw);
 
 	size_t globalWorkSize[1] = { size };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -502,7 +511,7 @@ void clAverage5x5Ex(cl_mem img/*in,out*/, const size_t xsize, const size_t ysize
     clEnqueueCopyBuffer(ocl.commandQueue, img, img_org, 0, 0, len, 0, NULL, NULL);
 
     cl_kernel kernel = ocl.kernel[KERNEL_AVERAGE5X5];
-    clSetKernelArgEx(kernel, &img, &img_org);
+    clSetKernelArgEx(kernel, &img, &xsize, &ysize, &img_org);
 
     size_t globalWorkSize[2] = { xsize, ysize };
     cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
@@ -523,7 +532,7 @@ void clMinSquareValEx(
 	cl_mem result = ocl.allocMem(sizeof(cl_float) * xsize * ysize);
 
 	cl_kernel kernel = ocl.kernel[KERNEL_MINSQUAREVAL];
-    clSetKernelArgEx(kernel, &result, &img, &square_size, &offset);
+    clSetKernelArgEx(kernel, &result, &xsize, &ysize, &img, &square_size, &offset);
 
 	size_t globalWorkSize[2] = { xsize, ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
index c4ddc6c7..80199dea 100644
--- a/clguetzli/cuguetzli.cpp
+++ b/clguetzli/cuguetzli.cpp
@@ -5,6 +5,10 @@
 #ifdef __USE_CUDA__
 
 #define cuFinish cuStreamSynchronize
+#define BLOCK_SIZE_X 16 // x block dimension passed to cuLaunchKernel
+#define BLOCK_SIZE_Y 12 // y block dimension passed to cuLaunchKernel
+#define BLOCK_COUNT_X(size)    (((size) + BLOCK_SIZE_X - 1) / BLOCK_SIZE_X) // ceil(size / BLOCK_SIZE_X)
+#define BLOCK_COUNT_Y(size)    (((size) + BLOCK_SIZE_Y - 1) / BLOCK_SIZE_Y) // ceil(size / BLOCK_SIZE_Y)
 
 void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize)
 {
@@ -118,6 +122,7 @@ void cuComputeBlockZeroingOrder(
     CUfunction kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER];
     const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2],
         &mem_orig_image, &mem_mask_scale,
+        &blockf_width, &blockf_height,
         &image_width, &image_height,
         &mem_mayout_coeff[0], &mem_mayout_coeff[1], &mem_mayout_coeff[2],
         &mem_mayout_pixel[0], &mem_mayout_pixel[1], &mem_mayout_pixel[2],
@@ -128,8 +133,8 @@ void cuComputeBlockZeroingOrder(
         &mem_output_order_batch };
 
     CUresult err = cuLaunchKernel(kernel,
-        blockf_width, blockf_height, 1,
-        1, 1, 1,
+        BLOCK_COUNT_X(blockf_width), BLOCK_COUNT_Y(blockf_height), 1,
+        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
     LOG_CU_RESULT(err);
@@ -216,11 +221,11 @@ void cuConvolutionXEx(
     ocu_args_d_t &ocl = getOcu();
 
 	CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTIONX];
-    const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
+    const void *args[] = { &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
 
     CUresult err = cuLaunchKernel(kernel,
-        xsize, ysize, 1,
-        1, 1, 1,
+        BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
+        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
@@ -237,11 +242,11 @@ void cuConvolutionYEx(
     ocu_args_d_t &ocl = getOcu();
 
 	CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTIONY];
-    const void *args[] = { &result, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
+    const void *args[] = { &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
 
     CUresult err = cuLaunchKernel(kernel,
-        xsize, ysize, 1,
-        1, 1, 1,
+        BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
+        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
@@ -257,11 +262,11 @@ void cuSquareSampleEx(
     ocu_args_d_t &ocl = getOcu();
 
 	CUfunction kernel = ocl.kernel[KERNEL_SQUARESAMPLE];
-    const void *args[] = { &result, &image, &xstep, &ystep };
+    const void *args[] = { &result, &xsize, &ysize, &image, &xstep, &ystep };
 
     CUresult err = cuLaunchKernel(kernel,
-        xsize, ysize, 1,
-        1, 1, 1,
+        BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
+        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
@@ -316,18 +321,20 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t
     ocu_args_d_t &ocl = getOcu();
     ocu_channels rgb_blurred = ocl.allocMemChannels(channel_size);
 
+    const int size = xsize * ysize;
+
     cuBlurEx(rgb.r, xsize, ysize, kSigma, 0.0, rgb_blurred.r);
     cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g);
     cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
 
 	CUfunction kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE];
-    void *args[] = { &rgb.r, &rgb.g, &rgb.b, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b };
+    const void *args[] = { &rgb.r, &rgb.g, &rgb.b, &size, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b };
 
     CUresult err = cuLaunchKernel(kernel,
-        xsize * ysize, 1, 1,
-        1, 1, 1,
+        (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1,
+        BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1,
         0,
-        ocl.commandQueue, args, NULL);
+        ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
     err = cuFinish(ocl.commandQueue);
 	LOG_CU_RESULT(err);
@@ -358,13 +365,14 @@ void cuMaskHighIntensityChangeEx(
 	CUfunction kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE];
     const void *args[] = { 
 		&xyb0.r, &xyb0.g, &xyb0.b,
+        &xsize, &ysize,
         &xyb1.r, &xyb1.g, &xyb1.b,
         &c0.r, &c0.g, &c0.b,
         &c1.r, &c1.g, &c1.b };
 
     CUresult err = cuLaunchKernel(kernel,
-        xsize, ysize, 1,
-        1, 1, 1,
+        BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
+        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
@@ -425,18 +433,19 @@ void cuBlockDiffMapEx(
 {
     ocu_args_d_t &ocl = getOcu();
 
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
 	CUfunction kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP];
     const void *args[] = { &block_diff_dc, &block_diff_ac,
+        &res_xsize, &res_ysize,
         &rgb.r, &rgb.g, &rgb.b,
         &rgb2.r, &rgb2.g, &rgb2.b,
         &xsize, &ysize, &step };
 
-    const size_t res_xsize = (xsize + step - 1) / step;
-    const size_t res_ysize = (ysize + step - 1) / step;
-
     CUresult err = cuLaunchKernel(kernel,
-        res_xsize, res_ysize, 1,
-        1, 1, 1,
+        BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1,
+        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
@@ -494,12 +503,13 @@ void cuDiffPrecomputeEx(
 
 	CUfunction kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE];
     const void *args[] = { &mask.x, &mask.y, &mask.b,
+        &xsize, &ysize,
         &xyb0.x, &xyb0.y, &xyb0.b,
         &xyb1.x, &xyb1.y, &xyb1.b };
 
     CUresult err = cuLaunchKernel(kernel,
-        xsize, ysize, 1,
-        1, 1, 1,
+        BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
+        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
@@ -510,13 +520,14 @@ void cuDiffPrecomputeEx(
 void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w)
 {
     ocu_args_d_t &ocl = getOcu();
+    float fw = w;
 
 	CUfunction kernel = ocl.kernel[KERNEL_SCALEIMAGE];
-    const void *args[] = { &img, &w };
+    const void *args[] = { &img, &size, &fw };
 
     CUresult err = cuLaunchKernel(kernel,
-        size, 1, 1,
-        1, 1, 1,
+        (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1,
+        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
@@ -539,11 +550,11 @@ void cuAverage5x5Ex(cu_mem img/*in,out*/, const size_t xsize, const size_t ysize
     cuMemcpyDtoD(img_org, img, len);
 
 	CUfunction kernel = ocl.kernel[KERNEL_AVERAGE5X5];
-    const void *args[] = { &img, &img_org };
+    const void *args[] = { &img, &xsize, &ysize, &img_org };
 
     CUresult err = cuLaunchKernel(kernel,
-        xsize, ysize, 1,
-        1, 1, 1,
+        BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
+        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
@@ -563,11 +574,11 @@ void cuMinSquareValEx(
     cu_mem result = ocl.allocMem(sizeof(float) * xsize * ysize);
 
 	CUfunction kernel = ocl.kernel[KERNEL_MINSQUAREVAL];
-    const void *args[] = { &result, &img, &square_size, &offset };
+    const void *args[] = { &result, &xsize, &ysize, &img, &square_size, &offset };
 
     CUresult err = cuLaunchKernel(kernel,
-        xsize, ysize, 1,
-        1, 1, 1,
+        BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
+        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
diff --git a/compile.bat b/compile.bat
index 156ee639..4d462695 100644
--- a/compile.bat
+++ b/compile.bat
@@ -4,4 +4,4 @@ call vcvars64.bat
 @echo %1 --machine 64 or 32
 @echo %2  -G 
 
-nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --fmad=false --machine %1 %2 -ptx -o clguetzli\clguetzli.cu.ptx%1  clguetzli\clguetzli.cu
\ No newline at end of file
+nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --machine %1 %2 -ptx -o clguetzli\clguetzli.cu.ptx%1  clguetzli\clguetzli.cu
\ No newline at end of file
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 8bbeebc2..f0e3fafe 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -156,7 +156,8 @@
       <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32</AdditionalLibraryDirectories>
     </Link>
     <PostBuildEvent>
-      <Command>compile.bat</Command>
+      <Command>
+      </Command>
     </PostBuildEvent>
     <CustomBuild>
       <Message>CUDA CU</Message>
@@ -396,24 +397,27 @@
       <FileType>Document</FileType>
       <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
       </Command>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </Intel_OpenCL_Build_Rules>
     <CustomBuild Include="clguetzli\clguetzli.cu">
       <FileType>Document</FileType>
       <Message Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CUDA Code Builder</Message>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)compile.bat 64 -G</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(ProjectDir)compile.bat 32 -G</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ProjectDir)compile.bat 64</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(ProjectDir)compile.bat 32</Command>
       <LinkObjects Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkObjects>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cu.ptx</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">clguetzli\clguetzli.cu.ptx64</Outputs>
       <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ProjectDir)compile.bat 64 -G</Command>
       <Message Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CUDA Code Builder</Message>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cu.ptx</Outputs>
       <LinkObjects Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</LinkObjects>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
       <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ProjectDir)compile.bat 32 -G</Command>
       <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CUDA Code Builder</Message>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cu.ptx</Outputs>
       <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CUDA Code Builder</Message>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cu.ptx</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
     </CustomBuild>
     <None Include="third_party\libpng\pngwin.def" />
     <None Include="third_party\zlib\inffas32.asm" />

From d13a9bad949cf1a3e0c80fe7745eecf04c269a0f Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 5 Jun 2017 19:20:46 +0800
Subject: [PATCH 136/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E5=91=BD=E4=BB=A4?=
 =?UTF-8?q?=E8=A1=8C=E6=8F=90=E7=A4=BA=EF=BC=8CMax=20Thread=20Per=20MP?=
 =?UTF-8?q?=E5=92=8CSP=E6=98=AF=E4=B8=8D=E4=B8=80=E6=A0=B7=E7=9A=84?=
 =?UTF-8?q?=E6=A6=82=E5=BF=B5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/ocu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index 8923bcd4..137c28bd 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -31,7 +31,7 @@ ocu_args_d_t& getOcu(void)
     cuDeviceGetAttribute(&cap_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
     cuDeviceGetAttribute(&proc_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
     cuDeviceGetAttribute(&thread_count, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
-    LogError("CUDA Adapter:%s Ver%d.%d MP %d Core %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count);
+    LogError("CUDA Adapter:%s Ver%d.%d MP %d MaxThread Per MP %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count);
 /*
     char* source = nullptr;
     size_t src_size = 0;

From f9ba50ebfa9223172539fcdf3046e44b5dc6a564 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 5 Jun 2017 20:17:03 +0800
Subject: [PATCH 137/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E5=8F=82=E6=95=B0?=
 =?UTF-8?q?=E8=AF=95=E8=AF=95=E6=80=A7=E8=83=BD=E6=83=85=E5=86=B5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl  |  5 +++--
 clguetzli/clguetzli.cpp |  1 +
 clguetzli/cuguetzli.cpp | 19 ++++++++++++-------
 guetzli.vcxproj         |  2 +-
 4 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 8df441e9..6ddd429c 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -613,6 +613,7 @@ __kernel void clMinSquareValEx(__global float* result, const int xsize, const in
 
 __kernel void clDoMaskEx(
     __global float *mask_x, __global float *mask_y, __global float *mask_b,
+    const int xsize, const int ysize,
     __global float *mask_dc_x, __global float *mask_dc_y, __global float *mask_dc_b,
     __global const double *lut_x, __global const double *lut_y, __global const double *lut_b,
     __global const double *lut_dc_x, __global const double *lut_dc_y, __global const double *lut_dc_b)
@@ -620,8 +621,8 @@ __kernel void clDoMaskEx(
     const int x = get_global_id(0);
     const int y = get_global_id(1);
 
-    const int xsize = get_global_size(0);
-    const int ysize = get_global_size(1);
+//    const int xsize = get_global_size(0);
+//    const int ysize = get_global_size(1);
 
 	const double w00 = 232.206464018;
 	const double w11 = 22.9455222245;
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 11cb16ed..a81b1189 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -646,6 +646,7 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
 
 	cl_kernel kernel = ocl.kernel[KERNEL_DOMASK];
     clSetKernelArgEx(kernel, &mask.r, &mask.g, &mask.b,
+        &xsize, &ysize,
         &mask_dc.r, &mask_dc.g, &mask_dc.b,
         &xyb.x, &xyb.y, &xyb.b,
         &xyb_dc.x, &xyb_dc.y, &xyb_dc.b);
diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
index 80199dea..8df19d01 100644
--- a/clguetzli/cuguetzli.cpp
+++ b/clguetzli/cuguetzli.cpp
@@ -6,7 +6,7 @@
 
 #define cuFinish cuStreamSynchronize
 #define BLOCK_SIZE_X 16
-#define BLOCK_SIZE_Y 12
+#define BLOCK_SIZE_Y 16
 #define BLOCK_COUNT_X(size)    ((size + BLOCK_SIZE_X - 1) / BLOCK_SIZE_X)
 #define BLOCK_COUNT_Y(size)    ((size + BLOCK_SIZE_Y - 1) / BLOCK_SIZE_Y)
 
@@ -331,8 +331,10 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t
     const void *args[] = { &rgb.r, &rgb.g, &rgb.b, &size, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b };
 
     CUresult err = cuLaunchKernel(kernel,
-        (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1,
-        BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1,
+//        (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1,
+//        BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1,
+        (size + 511) / 512, 1, 1,
+        512, 1, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
@@ -526,8 +528,10 @@ void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w)
     const void *args[] = { &img, &size, &fw };
 
     CUresult err = cuLaunchKernel(kernel,
-        (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1,
-        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
+//        (size + BLOCK_SIZE_X * BLOCK_SIZE_Y - 1) / BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1,
+        (size + 511) / 512, 1, 1,
+//        BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1,
+        512, 1, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
@@ -690,13 +694,14 @@ void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, siz
 
 	CUfunction kernel = ocl.kernel[KERNEL_DOMASK];
     const void *args[] = { &mask.r, &mask.g, &mask.b,
+        &xsize, &ysize,
         &mask_dc.r, &mask_dc.g, &mask_dc.b,
         &xyb.x, &xyb.y, &xyb.b,
         &xyb_dc.x, &xyb_dc.y, &xyb_dc.b };
 
     CUresult err = cuLaunchKernel(kernel,
-        xsize, ysize, 1,
-        1, 1, 1,
+        BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
+        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index f0e3fafe..e6070b25 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -397,7 +397,7 @@
       <FileType>Document</FileType>
       <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
       </Command>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
     </Intel_OpenCL_Build_Rules>
     <CustomBuild Include="clguetzli\clguetzli.cu">
       <FileType>Document</FileType>

From cce5bc3fd303a416b5fdbf7d5851d7895f5233e7 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 6 Jun 2017 09:23:31 +0800
Subject: [PATCH 138/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A364=E3=80=8132?=
 =?UTF-8?q?=E4=BD=8D=E5=88=A4=E6=96=AD=E7=9A=84=E5=AE=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/ocu.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index 137c28bd..3a1b695f 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -64,11 +64,10 @@ ocu_args_d_t& getOcu(void)
 
     char* ptx = nullptr;
     size_t src_size = 0;
-#ifdef _WIN64
+if (sizeof(void*) == 8)
     ReadSourceFromFile("clguetzli/clguetzli.cu.ptx64", &ptx, &src_size);
-#else
+else
     ReadSourceFromFile("clguetzli/clguetzli.cu.ptx32", &ptx, &src_size);
-#endif
 
     CUmodule mod;
     CUjit_option jit_options[2];

From 3237a5006e89943a17c57579fdfb04d690aa925b Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 6 Jun 2017 10:10:01 +0800
Subject: [PATCH 139/189] =?UTF-8?q?=E4=BC=98=E5=8C=96=20cuEdgeDetectorMapE?=
 =?UTF-8?q?x=20cuEdgeDetectorLowFreqEx=20cuRemoveBorderEx=20cuAddBorderEx?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl  | 44 +++++++++--------------------------------
 clguetzli/clguetzli.cpp | 23 ++++++++++++---------
 clguetzli/cuguetzli.cpp | 38 ++++++++++++++++++++---------------
 3 files changed, 45 insertions(+), 60 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 6ddd429c..c6648638 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -118,9 +118,6 @@ __kernel void clConvolutionXEx(
 
     if (x % step != 0) return;
 
-//    const int xsize = get_global_size(0);
-//    const int ysize = get_global_size(1);
-
     float weight_no_border = 0;
     for (int j = 0; j <= 2 * offset; j++)
     {
@@ -162,9 +159,6 @@ __kernel void clConvolutionYEx(
     if (x % step != 0) return;
     if (y % step != 0) return;
 
-//    const int xsize = get_global_size(0);
-//    const int ysize = get_global_size(1);
-
     float weight_no_border = 0;
     for (int j = 0; j <= 2 * offset; j++)
     {
@@ -207,9 +201,6 @@ __kernel void clSquareSampleEx(
 
     if (x_sample == x && y_sample == y) return;
 
-//    const int xsize = get_global_size(0);
-//    const int ysize = get_global_size(1);
-
     result[y * xsize + x] = image[y_sample * xsize + x_sample];
 }
 
@@ -255,8 +246,6 @@ __kernel void clMaskHighIntensityChangeEx(
     const int x = get_global_id(0);
     const int y = get_global_id(1);
     if (x >= xsize || y >= ysize) return;
-//    const int xsize = get_global_size(0);
-    //const int ysize = get_global_size(1);
 
     size_t ix = y * xsize + x;
     const double ave[3] = {
@@ -305,6 +294,7 @@ __kernel void clMaskHighIntensityChangeEx(
 
 __kernel void clEdgeDetectorMapEx(
 	__global float *result,
+    const int res_xsize, const int res_ysize,
     __global const float *r, __global const float *g, __global const float* b,
     __global const float *r2, __global const float* g2, __global const float *b2,
     int xsize, int ysize, int step)
@@ -312,8 +302,7 @@ __kernel void clEdgeDetectorMapEx(
     const int res_x = get_global_id(0);
     const int res_y = get_global_id(1);
 
-    const int res_xsize = get_global_size(0);
-    const int res_ysize = get_global_size(1);
+    if (res_x >= res_xsize || res_y >= res_ysize) return;
 
     int pos_x = res_x * step;
     int pos_y = res_y * step;
@@ -347,9 +336,6 @@ __kernel void clBlockDiffMapEx(
     const int res_x = get_global_id(0);
     const int res_y = get_global_id(1);
 
-//    const int res_xsize = get_global_size(0);
-//    const int res_ysize = get_global_size(1);
-
     if (res_x >= res_xsize || res_y >= res_ysize) return;
 
     int pos_x = res_x * step;
@@ -400,6 +386,7 @@ __kernel void clBlockDiffMapEx(
 
 __kernel void clEdgeDetectorLowFreqEx(
 	__global float *block_diff_ac,
+    const int res_xsize, const int res_ysize, 
     __global const float *r, __global const float *g, __global const float* b,
     __global const float *r2, __global const float* g2, __global const float *b2,
     int xsize, int ysize, int step_)
@@ -407,12 +394,11 @@ __kernel void clEdgeDetectorLowFreqEx(
     const int res_x = get_global_id(0);
     const int res_y = get_global_id(1);
 
+    if (res_x >= res_xsize || res_y >= res_ysize) return;
+
 	const int step = 8;
     if (res_x < step / step_) return;
 
-    const int res_xsize = get_global_size(0);
-    const int res_ysize = get_global_size(1);
-
     int x = (res_x - (step / step_)) * step_;
     int y = res_y * step_;
 
@@ -472,8 +458,6 @@ __kernel void clDiffPrecomputeEx(
     const int x = get_global_id(0);
     const int y = get_global_id(1);
     if (x >= xsize || y >= ysize) return;
-//    const int xsize = get_global_size(0);
-    //const int ysize = get_global_size(1);
 
     double valsh0[3] = { 0.0 };
     double valsv0[3] = { 0.0 };
@@ -546,9 +530,7 @@ __kernel void clAverage5x5Ex(__global float *img, const int xsize, const int ysi
     const int x = get_global_id(0);
     const int y = get_global_id(1);
     if (x >= xsize || y >= ysize) return;
-//    const int xsize = get_global_size(0);
-//    const int ysize = get_global_size(1);
-	
+
     const int row0 = y * xsize;
 	if (x - 1 >= 0) {
 		img[row0 + x] += img_org[row0 + x - 1];
@@ -588,8 +570,6 @@ __kernel void clMinSquareValEx(__global float* result, const int xsize, const in
     const int y = get_global_id(1);
 
     if (x >= xsize || y >= ysize) return;
-//    const int width = get_global_size(0);
-//    const int height = get_global_size(1);
 
     int minH = offset > y ? 0 : y - offset;
     int maxH = min(y + square_size - offset, ysize);
@@ -621,9 +601,6 @@ __kernel void clDoMaskEx(
     const int x = get_global_id(0);
     const int y = get_global_id(1);
 
-//    const int xsize = get_global_size(0);
-//    const int ysize = get_global_size(1);
-
 	const double w00 = 232.206464018;
 	const double w11 = 22.9455222245;
 	const double w22 = 503.962310606;
@@ -710,23 +687,20 @@ __kernel void clUpsampleSquareRootEx(__global float *diffmap_out, __global const
     }
 }
 
-__kernel void clRemoveBorderEx(__global float *out, __global const float *in, int in_xsize, int s, int s2)
+__kernel void clRemoveBorderEx(__global float *out, const int xsize, const int ysize, __global const float *in, int s, int s2)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
 
-    const int xsize = get_global_size(0);
-    const int ysize = get_global_size(1);
+    if (x >= xsize || y >= ysize) return;
 
     out[y * xsize + x] = in[(y + s2) * (xsize + s) + x + s2];
 }
 
-__kernel void clAddBorderEx(__global float *out, int s, int s2, __global const float *in)
+__kernel void clAddBorderEx(__global float *out, const int xsize, const int ysize, int s, int s2, __global const float *in)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
-    const int xsize = get_global_size(0);
-    const int ysize = get_global_size(1);
 
 	if (x >= xsize - s ||
 	    y >= ysize - s)
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index a81b1189..2bee09be 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -378,15 +378,16 @@ void clEdgeDetectorMapEx(
 		clBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]);
 	}
 
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
 	cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTOR];
     clSetKernelArgEx(kernel, &result,
+        &res_xsize, &res_ysize,
         &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
         &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
         &xsize, &ysize, &step);
 
-	const size_t res_xsize = (xsize + step - 1) / step;
-	const size_t res_ysize = (ysize + step - 1) / step;
-
 	size_t globalWorkSize[2] = { res_xsize, res_ysize};
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
     LOG_CL_RESULT(err);
@@ -442,15 +443,16 @@ void clEdgeDetectorLowFreqEx(
 		clBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]);
 	}
 
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
 	cl_kernel kernel = ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ];
     clSetKernelArgEx(kernel, &block_diff_ac,
+        &res_xsize, &res_ysize,
         &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
         &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
         &xsize, &ysize, &step);
 
-	const size_t res_xsize = (xsize + step - 1) / step;
-	const size_t res_ysize = (ysize + step - 1) / step;
-
 	size_t globalWorkSize[2] = { res_xsize, res_ysize };
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
     LOG_CL_RESULT(err);
@@ -753,10 +755,13 @@ void clRemoveBorderEx(cl_mem out, const cl_mem in, const size_t xsize, const siz
 	cl_int cls = 8 - step;
 	cl_int cls2 = (8 - step) / 2;
 
+    int out_xsize = xsize - cls;
+    int out_ysize = ysize - cls;
+
 	cl_kernel kernel = ocl.kernel[KERNEL_REMOVEBORDER];
-    clSetKernelArgEx(kernel, &out, &in, &xsize, &cls, &cls2);
+    clSetKernelArgEx(kernel, &out, &out_xsize, &out_ysize, &in, &cls, &cls2);
 
-	size_t globalWorkSize[2] = { xsize - cls, ysize - cls};
+	size_t globalWorkSize[2] = { out_xsize, out_ysize};
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
     LOG_CL_RESULT(err);
 	err = clFinish(ocl.commandQueue);
@@ -770,7 +775,7 @@ void clAddBorderEx(cl_mem out, size_t xsize, size_t ysize, int step, cl_mem in)
     cl_int cls = 8 - step;
     cl_int cls2 = (8 - step) / 2;
 	cl_kernel kernel = ocl.kernel[KERNEL_ADDBORDER];
-    clSetKernelArgEx(kernel, &out, &cls, &cls2, &in);
+    clSetKernelArgEx(kernel, &out, &xsize, &ysize, &cls, &cls2, &in);
 
 	size_t globalWorkSize[2] = { xsize, ysize};
 	cl_int err = clEnqueueNDRangeKernel(ocl.commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
index 8df19d01..b73967d9 100644
--- a/clguetzli/cuguetzli.cpp
+++ b/clguetzli/cuguetzli.cpp
@@ -405,18 +405,19 @@ void cuEdgeDetectorMapEx(
         cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma[i], 0.0, rgb2_blured.ch[i]);
     }
 
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
 	CUfunction kernel = ocl.kernel[KERNEL_EDGEDETECTOR];
     const void *args[] = { &result,
+        &res_xsize, &res_ysize,
         &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
         &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
         &xsize, &ysize, &step };
 
-    const size_t res_xsize = (xsize + step - 1) / step;
-    const size_t res_ysize = (ysize + step - 1) / step;
-
     CUresult err = cuLaunchKernel(kernel,
-        res_xsize, res_ysize, 1,
-        1, 1, 1,
+        BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1,
+        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
@@ -474,18 +475,20 @@ void cuEdgeDetectorLowFreqEx(
         cuBlurEx(rgb2.ch[i], xsize, ysize, kSigma, 0.0, rgb2_blured.ch[i]);
     }
 
+    const size_t res_xsize = (xsize + step - 1) / step;
+    const size_t res_ysize = (ysize + step - 1) / step;
+
 	CUfunction kernel = ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ];
     const void *args[] = { &block_diff_ac,
+        &res_xsize, &res_ysize,
         &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
         &rgb2_blured.r, &rgb2_blured.g, &rgb2_blured.b,
         &xsize, &ysize, &step };
 
-    const size_t res_xsize = (xsize + step - 1) / step;
-    const size_t res_ysize = (ysize + step - 1) / step;
-
+    
     CUresult err = cuLaunchKernel(kernel,
-        res_xsize, res_ysize, 1,
-        1, 1, 1,
+        BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1,
+        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
@@ -809,12 +812,15 @@ void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const siz
     int cls = 8 - step;
     int cls2 = (8 - step) / 2;
 
+    int out_xsize = xsize - cls;
+    int out_ysize = ysize - cls;
+
 	CUfunction kernel = ocl.kernel[KERNEL_REMOVEBORDER];
-    const void *args[] = { &out, &in, &xsize, &cls, &cls2 };
+    const void *args[] = { &out, &out_xsize, &out_ysize, &in, &cls, &cls2 };
 
     CUresult err = cuLaunchKernel(kernel,
-        xsize - cls, ysize - cls, 1,
-        1, 1, 1,
+        BLOCK_COUNT_X(out_xsize), BLOCK_COUNT_Y(out_ysize), 1,
+        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
@@ -829,11 +835,11 @@ void cuAddBorderEx(cu_mem out, size_t xsize, size_t ysize, int step, cu_mem in)
     int cls = 8 - step;
     int cls2 = (8 - step) / 2;
 	CUfunction kernel = ocl.kernel[KERNEL_ADDBORDER];
-    const void *args[] = { &out, &cls, &cls2, &in };
+    const void *args[] = { &out, &xsize, &ysize, &cls, &cls2, &in };
 
     CUresult err = cuLaunchKernel(kernel,
-        xsize, ysize, 1,
-        1, 1, 1,
+        BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
+        BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
         ocl.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);

From 9f8597d749176e783a6a722914c471a8707bc1ea Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 6 Jun 2017 10:28:16 +0800
Subject: [PATCH 140/189] =?UTF-8?q?=E6=81=A2=E5=A4=8Dfactor=3D2=E7=9A=84?=
 =?UTF-8?q?=E6=94=AF=E6=8C=81=EF=BC=8C=E6=80=A7=E8=83=BD=E5=B7=AE=E5=88=AB?=
 =?UTF-8?q?=E4=B8=8D=E5=A4=A7=EF=BC=8C=E4=BD=86=E6=98=AF=E7=BC=96=E8=AF=91?=
 =?UTF-8?q?=E6=97=B6=E9=97=B4=E5=8F=98=E9=95=BF=E4=BA=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index c6648638..9236a52b 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -3152,16 +3152,16 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
         candidate_channel[c] = &candidate_block[c * 8 * 8];
     }
 
-//    uchar yuv16x16[3 * 16 * 16] = { 0 };  // factor 2 mode output image
+    uchar yuv16x16[3 * 16 * 16] = { 0 };  // factor 2 mode output image
     uchar yuv8x8[3 * 8 * 8] = { 0 };      // factor 1 mode output image
 
     for (int c = 0; c < 3; c++)
     {
-//        if (mayout_channel[c].factor == 1) {
-  //          if (factor == 1) {
+        if (mayout_channel[c].factor == 1) {
+            if (factor == 1) {
                 const coeff_t *coeff_block = candidate_channel[c];
                 CoeffToYUV8x8(coeff_block, &yuv8x8[c]);
-   /*         }
+            }
             else {
                 for (int iy = 0; iy < factor; ++iy) {
                     for (int ix = 0; ix < factor; ++ix) {
@@ -3183,8 +3183,7 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
                     }
                 }
             }
-*/
- /*       }
+        }
         else { 
             if (factor == 1) {
                 int block_xx = block_x / mayout_channel[c].factor;
@@ -3211,10 +3210,9 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
                     image_height);
             }
         }
-*/
     }
 
-  //  if (factor == 1)
+    if (factor == 1)
     {
         float rgb0_c[3][kDCTBlockSize];
         int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, 0, 0);
@@ -3227,7 +3225,6 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
 
         return ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3);
     }
-/*
     else
     {
         int inside_x = block_x * 16 + 16 > image_width ? image_width - block_x * 16 : 16;
@@ -3259,6 +3256,5 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
         }
         return max_err;
     }
-*/
 }
 

From 61fde3c0b89f204cf7f0a7bcb0b3869edc414f76 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 6 Jun 2017 13:11:16 +0800
Subject: [PATCH 141/189] =?UTF-8?q?=E4=BC=98=E5=8C=96=E7=BC=96=E8=AF=91?=
 =?UTF-8?q?=E5=92=8CTest=E8=84=9A=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 compile.bat | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/compile.bat b/compile.bat
index 4d462695..c7cd2cc7 100644
--- a/compile.bat
+++ b/compile.bat
@@ -4,4 +4,9 @@ call vcvars64.bat
 @echo %1 --machine 64 or 32
 @echo %2  -G 
 
-nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --machine %1 %2 -ptx -o clguetzli\clguetzli.cu.ptx%1  clguetzli\clguetzli.cu
\ No newline at end of file
+set machine_num=%1%
+set debug_opt=%2%
+
+if "%machine_num%" == "" set machine_num=64
+
+nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --machine %machine_num% %debug_opt% -ptx -o clguetzli\clguetzli.cu.ptx%machine_num%  clguetzli\clguetzli.cu
\ No newline at end of file

From 8fe84545a633c06a2cebbf952021646aad1225f7 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 6 Jun 2017 15:07:09 +0800
Subject: [PATCH 142/189] =?UTF-8?q?=E5=87=8F=E5=B0=91kernel=E4=B8=AD?=
 =?UTF-8?q?=E4=B8=80=E4=BA=9B=E5=86=97=E4=BD=99=E7=9A=84=E6=95=B0=E6=8D=AE?=
 =?UTF-8?q?copy?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl | 181 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 166 insertions(+), 15 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 9236a52b..2509ffa2 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -47,15 +47,33 @@ __device__ void Butteraugli8x8CornerEdgeDetectorDiff(
 
 __device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_block[3*8*8], IntFloatPairList *input_order);
 
-__device__ double CompareBlockFactor(const channel_info mayout_channel[3],
+__device__ double Factor2(const channel_info mayout_channel[3],
                         const coeff_t* candidate_block, 
                         const int block_x, 
                         const int block_y, 
                         __global const float *orig_image_batch,
                         __global const float *mask_scale,
                         const int image_width,
-                        const int image_height,
-                        const int factor);
+                        const int image_height);
+
+__device__ double CompareBlockFactor1(const channel_info mayout_channel[3],
+    const coeff_t* candidate_block,
+    const int block_x,
+    const int block_y,
+    __global const float *orig_image_batch,
+    __global const float *mask_scale,
+    const int image_width,
+    const int image_height);
+
+__device__ double CompareBlockFactor(const channel_info mayout_channel[3],
+    const coeff_t* candidate_block,
+    const int block_x,
+    const int block_y,
+    __global const float *orig_image_batch,
+    __global const float *mask_scale,
+    const int image_width,
+    const int image_height,
+    const int factor);
 
 __device__ void floatcopy(float *dst, const float *src, int size);
 __device__ void coeffcopy(coeff_t *dst, const coeff_t *src, int size);
@@ -782,40 +800,37 @@ __kernel void clComputeBlockZeroingOrderEx(
     IntFloatPairList output_order = { 0, output_order_data };
 
     int count = MakeInputOrderEx(mayout_block, orig_block, &input_order);
-
-    coeff_t processed_block[kComputeBlockSize];
-    coeffcopy(processed_block, mayout_block, kComputeBlockSize);
-
+    
     while (input_order.size > 0)
     {
         float best_err = 1e17f;
         int best_i = 0;
         for (int i = 0; i < min(3, input_order.size); i++)
         {
-            coeff_t candidate_block[kComputeBlockSize];
-            coeffcopy(candidate_block, processed_block, kComputeBlockSize);
-
             const int idx = input_order.pData[i].idx;
-            candidate_block[idx] = 0;
+            coeff_t old_coeff = mayout_block[idx];
+            mayout_block[idx] = 0;
+
 
             float max_err = CompareBlockFactor(mayout_channel,
-                                               candidate_block,
+                                               mayout_block,
                                                block_x,
                                                block_y,
                                                orig_image_batch,
                                                mask_scale,
-                                               image_width, 
-                                               image_height, 
+                                               image_width,
+                                               image_height,
                                                factor);
             if (max_err < best_err)
             {
                 best_err = max_err;
                 best_i = i;
             }
+            mayout_block[idx] = old_coeff;
         }
 
         int idx = input_order.pData[best_i].idx;
-        processed_block[idx] = 0;
+        mayout_block[idx] = 0;
         list_erase(&input_order, best_i);
 
         list_push_back(&output_order, idx, best_err);
@@ -3137,6 +3152,142 @@ __device__ int GetOrigBlock(float rgb0_c[3][kDCTBlockSize],
     return block_ix;
 }
 
+__device__ double CompareBlockFactor1(const channel_info mayout_channel[3],  // per-channel data; reads .factor, .block_width, .coeff and .pixel
+    const coeff_t* candidate_block,           // three consecutive 8x8 coefficient planes, one per channel
+    const int block_x,                        // 8x8 block column (pixel x starts at block_x * 8)
+    const int block_y,                        // 8x8 block row (pixel y starts at block_y * 8)
+    __global const float *orig_image_batch,   // source handed to GetOrigBlock to fetch the reference block
+    __global const float *mask_scale,         // read starting at 3 * (block index returned by GetOrigBlock)
+    const int image_width,                    // image width in pixels
+    const int image_height)                   // image height in pixels
+{   // Rebuilds the 8x8 block at (block_x, block_y) and returns ComputeImage8x8Block's result for it against the original.
+    const coeff_t *candidate_channel[3];
+    for (int c = 0; c < 3; c++) {
+        candidate_channel[c] = &candidate_block[c * 8 * 8];
+    }
+    // candidate_channel[c] now points at plane c of candidate_block (64 coefficients per plane).
+    uchar yuv16x16[3 * 16 * 16] = { 0 };  // scratch for channels whose factor is not 1
+    uchar yuv8x8[3 * 8 * 8] = { 0 };      // the 8x8 block that is finally compared
+    // NOTE(review): outputs are addressed as &yuv8x8[c] / &yuv16x16[c], which suggests channel-interleaved samples - confirm in CoeffToYUV8x8.
+    for (int c = 0; c < 3; c++)
+    {
+        if (mayout_channel[c].factor == 1) {
+            const coeff_t *coeff_block = candidate_channel[c];
+            CoeffToYUV8x8(coeff_block, &yuv8x8[c]);
+        }
+        else {
+            int block_xx = block_x / mayout_channel[c].factor;
+            int block_yy = block_y / mayout_channel[c].factor;
+            int ix = block_x % mayout_channel[c].factor;;
+            int iy = block_y % mayout_channel[c].factor;
+            // (block_xx, block_yy) locate the block in this channel's coefficient array; (ix, iy) are the remainders of block_x / block_y.
+            int block_16x16idx = block_yy * mayout_channel[c].block_width + block_xx;
+            __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_16x16idx * 8 * 8;
+            // This channel does not use candidate_block: its coefficients are read from mayout_channel[c].coeff.
+            CoeffToYUV16x16_g(coeff_block, &yuv16x16[c],
+                mayout_channel[c].pixel, block_xx, block_yy,
+                image_width,
+                image_height);
+
+            // copy the (ix, iy) 8x8 corner of the YUV16x16 output into YUV8x8
+            Copy16x16To8x8(&yuv16x16[c], &yuv8x8[c], ix, iy);
+        }
+    }
+    // Compare the rebuilt block with the reference block fetched by GetOrigBlock.
+    {
+        float rgb0_c[3][kDCTBlockSize];
+        int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, 1, 0, 0);
+        // Clip the 8x8 extent to the image: a block on the right/bottom edge may be only partly inside.
+        int inside_x = block_x * 8 + 8 > image_width ? image_width - block_x * 8 : 8;
+        int inside_y = block_y * 8 + 8 > image_height ? image_height - block_y * 8 : 8;
+        float rgb1_c[3][kDCTBlockSize];
+
+        YUVToImage(yuv8x8, rgb1_c[0], rgb1_c[1], rgb1_c[2], 8, 8, inside_x, inside_y);
+
+        return ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3);
+    }
+}
+
+__device__ double Factor2(const channel_info mayout_channel[3],  // per-channel data; reads .factor, .block_width, .block_height, .coeff and .pixel
+    const coeff_t* candidate_block,           // three consecutive 8x8 coefficient planes, one per channel
+    const int block_x,                        // 16x16 block column (pixel x starts at block_x * 16)
+    const int block_y,                        // 16x16 block row (pixel y starts at block_y * 16)
+    __global const float *orig_image_batch,   // source handed to GetOrigBlock to fetch each reference block
+    __global const float *mask_scale,         // read starting at 3 * (block index returned by GetOrigBlock)
+    const int image_width,                    // image width in pixels
+    const int image_height)                   // image height in pixels
+{   // Rebuilds the 16x16 area at (block_x, block_y) and returns the largest ComputeImage8x8Block result over its 8x8 sub-blocks.
+    const int factor = 2;  // the area is handled as factor x factor sub-blocks of 8x8
+    const coeff_t *candidate_channel[3];
+    for (int c = 0; c < 3; c++) {
+        candidate_channel[c] = &candidate_block[c * 8 * 8];
+    }
+    // candidate_channel[c] now points at plane c of candidate_block (64 coefficients per plane).
+    uchar yuv16x16[3 * 16 * 16] = { 0 };  // the 16x16 area that is finally compared
+    uchar yuv8x8[3 * 8 * 8] = { 0 };      // scratch for one 8x8 sub-block of a factor-1 channel
+    // Channels with factor 1 are assembled from stored 8x8 blocks; every other channel is decoded from candidate_block.
+    for (int c = 0; c < 3; c++)
+    {
+        if (mayout_channel[c].factor == 1) {
+                for (int iy = 0; iy < factor; ++iy) {
+                    for (int ix = 0; ix < factor; ++ix) {
+                        int block_xx = block_x * factor + ix;
+                        int block_yy = block_y * factor + iy;
+                        // Sub-blocks outside the channel's block grid are skipped; their corner of yuv16x16 keeps its zero fill.
+                        ///if (ix != off_x || iy != off_y) continue;
+                        if (block_xx >= mayout_channel[c].block_width ||
+                            block_yy >= mayout_channel[c].block_height)
+                        {
+                            continue;
+                        }
+                        int block_8x8idx = block_yy * mayout_channel[c].block_width + block_xx;
+                        __global const coeff_t * coeff_block = mayout_channel[c].coeff + block_8x8idx * 8 * 8;
+                        CoeffToYUV8x8_g(coeff_block, &yuv8x8[c]);
+
+                        // copy YUV8x8 into the (ix, iy) corner of YUV16x16
+                        Copy8x8To16x16(&yuv8x8[c], &yuv16x16[c], ix, iy);
+                    }
+                }
+        }
+        else {
+                const coeff_t * coeff_block = candidate_channel[c];
+                CoeffToYUV16x16(coeff_block, &yuv16x16[c],
+                    mayout_channel[c].pixel, block_x, block_y,
+                    image_width,
+                    image_height);
+        }
+    }
+    // Clip the 16x16 extent to the image: an area on the right/bottom edge may be only partly inside.
+        int inside_x = block_x * 16 + 16 > image_width ? image_width - block_x * 16 : 16;
+        int inside_y = block_y * 16 + 16 > image_height ? image_height - block_y * 16 : 16;
+
+        float rgb16x16[3][16 * 16];
+        YUVToImage(yuv16x16, rgb16x16[0], rgb16x16[1], rgb16x16[2], 16, 16, inside_x, inside_y);
+        // Score each 8x8 sub-block separately and keep the worst one; the result stays 0 if no sub-block starts inside the image.
+        double max_err = 0;
+        for (int iy = 0; iy < factor; ++iy) {
+            for (int ix = 0; ix < factor; ++ix) {
+                int block_xx = block_x * factor + ix;
+                int block_yy = block_y * factor + iy;
+                // Skip sub-blocks whose top-left pixel lies outside the image.
+                if (block_xx * 8 >= image_width ||
+                    block_yy * 8 >= image_height)
+                {
+                    continue;
+                }
+
+                float rgb0_c[3][kDCTBlockSize];
+                int block_8x8idx = GetOrigBlock(rgb0_c, orig_image_batch, image_width, image_height, block_x, block_y, factor, ix, iy);
+
+                float rgb1_c[3][kDCTBlockSize];
+                Copy16x16ToChannel(rgb16x16, rgb1_c[0], rgb1_c[1], rgb1_c[2], ix, iy);
+                double err = ComputeImage8x8Block(rgb0_c, rgb1_c, mask_scale + block_8x8idx * 3);
+                max_err = max(max_err, err);
+            }
+        }
+        return max_err;
+}
+
 __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
                           const coeff_t* candidate_block, 
                           const int block_x, 

From c90b88a86791038ff9c15eef4274e3beeddb6e67 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 6 Jun 2017 15:08:56 +0800
Subject: [PATCH 143/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

---
 .gitignore               |   1 +
 clguetzli/clguetzli.cl   |   9 ++-
 clguetzli/clguetzli.cl.h |   2 +-
 clguetzli/clguetzli.cu   |   2 +-
 clguetzli/ocu.cpp        |   2 +-
 compile.sh               |  12 +++
 guetzli.make             | 170 +++++++--------------------------------
 guetzli_static.make      | 157 ++++++------------------------------
 premake5.lua             |   3 +-
 9 files changed, 75 insertions(+), 283 deletions(-)
 create mode 100644 compile.sh

diff --git a/.gitignore b/.gitignore
index 3d270281..0cc93f06 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,4 @@ ipch/
 *.VC.db
 *.VC.VC.opendb
 guetzli.vcxproj.user
+clguetzli/clguetzli.cu.ptx*
diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 2509ffa2..7bacec40 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1998,6 +1998,11 @@ __constant static float bias[192] = {
 	0.0
 };
 
+__device__ coeff_t _abs(coeff_t val)  // Absolute value of one coefficient; MakeInputOrder and MakeInputOrderEx call this instead of abs().
+{
+	return val >= 0 ? val : -val;  // NOTE(review): assumes -val fits in coeff_t - confirm the most negative value never occurs
+}
+
 // chrisk todo
 // return the count of Non-zero item
 __device__ int MakeInputOrder(__global const coeff_t *block, __global const coeff_t *orig_block, IntFloatPairList *input_order, int block_size)
@@ -2007,7 +2012,7 @@ __device__ int MakeInputOrder(__global const coeff_t *block, __global const coef
 		for (int k = 1; k < block_size; ++k) {
 			int idx = c * block_size + k;
 			if (block[idx] != 0) {
-				float score = abs(orig_block[idx]) * csf[idx] + bias[idx];
+				float score = _abs(orig_block[idx]) * csf[idx] + bias[idx];
 				size = list_push_back(input_order, idx, score);
 			}
 		}
@@ -3118,7 +3123,7 @@ __device__ int MakeInputOrderEx(const coeff_t block[3*8*8], const coeff_t orig_b
         for (int k = 1; k < block_size; ++k) {
             int idx = c * block_size + k;
             if (block[idx] != 0) {
-                float score = abs(orig_block[idx]) * csf[idx] + bias[idx];
+                float score = _abs(orig_block[idx]) * csf[idx] + bias[idx];
                 size = list_push_back(input_order, idx, score);
             }
         }
diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h
index 2a8ed044..288db67c 100644
--- a/clguetzli/clguetzli.cl.h
+++ b/clguetzli/clguetzli.cl.h
@@ -3,7 +3,7 @@
 
 #ifdef __cplusplus
 #ifndef __CUDACC__
-#include "CL\cl.h"
+#include "CL/cl.h"
 #include "cuda.h"
 #endif
 #endif
diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu
index dbca9906..351bed47 100644
--- a/clguetzli/clguetzli.cu
+++ b/clguetzli/clguetzli.cu
@@ -1,4 +1,4 @@
-#include "clguetzli\clguetzli.cl"
+#include "clguetzli/clguetzli.cl"
 /*
 __device__ int get_global_id(int dim)
 {
diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index 3a1b695f..48f2768a 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -199,7 +199,7 @@ const char* TranslateCUDAError(CUresult errorCode)
     case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";
     case CUDA_ERROR_INVALID_PTX: return "CUDA_ERROR_INVALID_PTX";
     case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT";
-    case CUDA_ERROR_NVLINK_UNCORRECTABLE: return "CUDA_ERROR_NVLINK_UNCORRECTABLE";
+    // case CUDA_ERROR_NVLINK_UNCORRECTABLE: return "CUDA_ERROR_NVLINK_UNCORRECTABLE";
     case CUDA_ERROR_INVALID_SOURCE: return "CUDA_ERROR_INVALID_SOURCE";
     case CUDA_ERROR_FILE_NOT_FOUND: return "CUDA_ERROR_FILE_NOT_FOUND";
     case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
diff --git a/compile.sh b/compile.sh
new file mode 100644
index 00000000..9aa628bc
--- /dev/null
+++ b/compile.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+#Compile .cu file
+echo $1 --machine 64 or 32
+echo $2 -G
+
+nvcc -I"./" -I"/usr/local/cuda/include" -arch=compute_30 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1  clguetzli/clguetzli.cu
+
+#copy to ./bin/Release
+cp clguetzli/clguetzli.cu.ptx$1 bin/Release/clguetzli/clguetzli.cu.ptx$1
+cp clguetzli/clguetzli.cl bin/Release/clguetzli/clguetzli.cl
+cp clguetzli/clguetzli.cl.h bin/Release/clguetzli/clguetzli.cl.h
\ No newline at end of file
diff --git a/guetzli.make b/guetzli.make
index 442d678b..3675ba0d 100644
--- a/guetzli.make
+++ b/guetzli.make
@@ -15,14 +15,14 @@ ifeq ($(config),release)
   TARGETDIR = bin/Release
   TARGET = $(TARGETDIR)/guetzli
   OBJDIR = obj/Release/guetzli
-  DEFINES +=
+  DEFINES += -D__USE_CUDA__
   INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)
   ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -O3 -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES)
-  LIBS += -lOpenCL
+  LIBS += -lOpenCL -lcuda
   LDDEPS +=
   ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags`
   LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS)
@@ -32,7 +32,7 @@ ifeq ($(config),release)
   endef
   define POSTBUILDCMDS
   endef
-all: prebuild prelink $(TARGET)
+all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET)
 	@:
 
 endif
@@ -42,14 +42,14 @@ ifeq ($(config),debug)
   TARGETDIR = bin/Debug
   TARGET = $(TARGETDIR)/guetzli
   OBJDIR = obj/Debug/guetzli
-  DEFINES +=
+  DEFINES += -D__USE_CUDA__
   INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)
   ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES)
-  LIBS += -lOpenCL
+  LIBS += -lOpenCL -lcuda
   LDDEPS +=
   ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags`
   LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS)
@@ -59,7 +59,7 @@ ifeq ($(config),debug)
   endef
   define POSTBUILDCMDS
   endef
-all: prebuild prelink $(TARGET)
+all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET)
 	@:
 
 endif
@@ -69,7 +69,9 @@ OBJECTS := \
 	$(OBJDIR)/clguetzli.cl.o \
 	$(OBJDIR)/clguetzli.o \
 	$(OBJDIR)/clguetzli_test.o \
+	$(OBJDIR)/cuguetzli.o \
 	$(OBJDIR)/ocl.o \
+	$(OBJDIR)/ocu.o \
 	$(OBJDIR)/utils.o \
 	$(OBJDIR)/butteraugli_comparator.o \
 	$(OBJDIR)/dct_double.o \
@@ -107,13 +109,24 @@ endif
 
 $(TARGET): $(GCH) ${CUSTOMFILES} $(OBJECTS) $(LDDEPS) $(RESOURCES)
 	@echo Linking guetzli
+	$(SILENT) $(LINKCMD)
+	$(POSTBUILDCMDS)
+
+$(TARGETDIR):
+	@echo Creating $(TARGETDIR)
 ifeq (posix,$(SHELLTYPE))
 	$(SILENT) mkdir -p $(TARGETDIR)
 else
 	$(SILENT) mkdir $(subst /,\\,$(TARGETDIR))
 endif
-	$(SILENT) $(LINKCMD)
-	$(POSTBUILDCMDS)
+
+$(OBJDIR):
+	@echo Creating $(OBJDIR)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 
 clean:
 	@echo Cleaning guetzli
@@ -140,219 +153,90 @@ endif
 
 $(OBJDIR)/clbutter_comparator.o: clguetzli/clbutter_comparator.cpp
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/clguetzli.cl.o: clguetzli/clguetzli.cl.cpp
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/clguetzli.o: clguetzli/clguetzli.cpp
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/cuguetzli.o: clguetzli/cuguetzli.cpp
+	@echo $(notdir $<)
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/ocl.o: clguetzli/ocl.cpp
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/ocu.o: clguetzli/ocu.cpp
+	@echo $(notdir $<)
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/utils.o: clguetzli/utils.cpp
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/butteraugli_comparator.o: guetzli/butteraugli_comparator.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/dct_double.o: guetzli/dct_double.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/debug_print.o: guetzli/debug_print.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/entropy_encode.o: guetzli/entropy_encode.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/fdct.o: guetzli/fdct.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/gamma_correct.o: guetzli/gamma_correct.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/guetzli.o: guetzli/guetzli.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/idct.o: guetzli/idct.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data.o: guetzli/jpeg_data.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_decoder.o: guetzli/jpeg_data_decoder.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_encoder.o: guetzli/jpeg_data_encoder.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_reader.o: guetzli/jpeg_data_reader.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_writer.o: guetzli/jpeg_data_writer.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_huffman_decode.o: guetzli/jpeg_huffman_decode.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/output_image.o: guetzli/output_image.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/preprocess_downsample.o: guetzli/preprocess_downsample.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/processor.o: guetzli/processor.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/quality.o: guetzli/quality.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/quantize.o: guetzli/quantize.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/score.o: guetzli/score.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/butteraugli.o: third_party/butteraugli/butteraugli/butteraugli.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 
 -include $(OBJECTS:%.o=%.d)
diff --git a/guetzli_static.make b/guetzli_static.make
index f271c46f..68808523 100644
--- a/guetzli_static.make
+++ b/guetzli_static.make
@@ -32,7 +32,7 @@ ifeq ($(config),release)
   endef
   define POSTBUILDCMDS
   endef
-all: prebuild prelink $(TARGET)
+all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET)
 	@:
 
 endif
@@ -59,7 +59,7 @@ ifeq ($(config),debug)
   endef
   define POSTBUILDCMDS
   endef
-all: prebuild prelink $(TARGET)
+all: $(TARGETDIR) $(OBJDIR) prebuild prelink $(TARGET)
 	@:
 
 endif
@@ -69,7 +69,9 @@ OBJECTS := \
 	$(OBJDIR)/clguetzli.cl.o \
 	$(OBJDIR)/clguetzli.o \
 	$(OBJDIR)/clguetzli_test.o \
+	$(OBJDIR)/cuguetzli.o \
 	$(OBJDIR)/ocl.o \
+	$(OBJDIR)/ocu.o \
 	$(OBJDIR)/utils.o \
 	$(OBJDIR)/butteraugli_comparator.o \
 	$(OBJDIR)/dct_double.o \
@@ -106,13 +108,24 @@ endif
 
 $(TARGET): $(GCH) ${CUSTOMFILES} $(OBJECTS) $(LDDEPS) $(RESOURCES)
 	@echo Linking guetzli_static
+	$(SILENT) $(LINKCMD)
+	$(POSTBUILDCMDS)
+
+$(TARGETDIR):
+	@echo Creating $(TARGETDIR)
 ifeq (posix,$(SHELLTYPE))
 	$(SILENT) mkdir -p $(TARGETDIR)
 else
 	$(SILENT) mkdir $(subst /,\\,$(TARGETDIR))
 endif
-	$(SILENT) $(LINKCMD)
-	$(POSTBUILDCMDS)
+
+$(OBJDIR):
+	@echo Creating $(OBJDIR)
+ifeq (posix,$(SHELLTYPE))
+	$(SILENT) mkdir -p $(OBJDIR)
+else
+	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
+endif
 
 clean:
 	@echo Cleaning guetzli_static
@@ -139,211 +152,87 @@ endif
 
 $(OBJDIR)/clbutter_comparator.o: clguetzli/clbutter_comparator.cpp
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/clguetzli.cl.o: clguetzli/clguetzli.cl.cpp
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/clguetzli.o: clguetzli/clguetzli.cpp
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/cuguetzli.o: clguetzli/cuguetzli.cpp
+	@echo $(notdir $<)
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/ocl.o: clguetzli/ocl.cpp
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/ocu.o: clguetzli/ocu.cpp
+	@echo $(notdir $<)
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/utils.o: clguetzli/utils.cpp
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/butteraugli_comparator.o: guetzli/butteraugli_comparator.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/dct_double.o: guetzli/dct_double.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/debug_print.o: guetzli/debug_print.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/entropy_encode.o: guetzli/entropy_encode.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/fdct.o: guetzli/fdct.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/gamma_correct.o: guetzli/gamma_correct.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/idct.o: guetzli/idct.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data.o: guetzli/jpeg_data.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_decoder.o: guetzli/jpeg_data_decoder.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_encoder.o: guetzli/jpeg_data_encoder.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_reader.o: guetzli/jpeg_data_reader.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_data_writer.o: guetzli/jpeg_data_writer.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/jpeg_huffman_decode.o: guetzli/jpeg_huffman_decode.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/output_image.o: guetzli/output_image.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/preprocess_downsample.o: guetzli/preprocess_downsample.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/processor.o: guetzli/processor.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/quality.o: guetzli/quality.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/quantize.o: guetzli/quantize.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/score.o: guetzli/score.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/butteraugli.o: third_party/butteraugli/butteraugli/butteraugli.cc
 	@echo $(notdir $<)
-ifeq (posix,$(SHELLTYPE))
-	$(SILENT) mkdir -p $(OBJDIR)
-else
-	$(SILENT) mkdir $(subst /,\\,$(OBJDIR))
-endif
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 
 -include $(OBJECTS:%.o=%.d)
diff --git a/premake5.lua b/premake5.lua
index 18f5ecee..f6723df8 100644
--- a/premake5.lua
+++ b/premake5.lua
@@ -42,9 +42,10 @@ workspace "guetzli"
   project "guetzli"
     kind "ConsoleApp"
     filter "action:gmake"
+	  defines { "__USE_CUDA__" }
       linkoptions { "`pkg-config --libs libpng || libpng-config --ldflags`" }
       buildoptions { "`pkg-config --cflags libpng || libpng-config --cflags`" }
-      links { "OpenCL" }
+      links { "OpenCL", "cuda" }
     filter "action:vs*"
       links { "shlwapi" }
     filter {}

From 1e4b4f4325e1c4be44c9b7501cb1bcd05c6c8322 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 6 Jun 2017 18:25:26 +0800
Subject: [PATCH 144/189] =?UTF-8?q?=E4=BC=98=E5=8C=96clDiffmapOpsinDynamic?=
 =?UTF-8?q?sImageEx?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cpp                       | 69 ++++++++++--------
 clguetzli/clguetzli.h                         |  8 +++
 clguetzli/cuguetzli.cpp                       | 70 +++++++++++--------
 clguetzli/cuguetzli.h                         |  8 +++
 guetzli/butteraugli_comparator.cc             | 60 +++++++++++++++-
 guetzli/guetzli.cc                            | 12 ++--
 .../butteraugli/butteraugli/butteraugli.h     |  1 +
 7 files changed, 166 insertions(+), 62 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 2bee09be..0d52eb33 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -30,11 +30,7 @@ void clDiffmapOpsinDynamicsImage(
     const size_t xsize, const size_t ysize,
     const size_t step)
 {
-    const size_t res_xsize = (xsize + step - 1) / step;
-    const size_t res_ysize = (ysize + step - 1) / step;
-
     size_t channel_size = xsize * ysize * sizeof(float);
-    size_t channel_step_size = res_xsize * res_ysize * sizeof(float);
 
     ocl_args_d_t &ocl = getOcl();
     ocl_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
@@ -42,26 +38,7 @@ void clDiffmapOpsinDynamicsImage(
 
     cl_mem mem_result = ocl.allocMem(channel_size, result);
 
-    cl_mem edge_detector_map = ocl.allocMem(3 * channel_step_size);
-    cl_mem block_diff_dc = ocl.allocMem(3 * channel_step_size);
-    cl_mem block_diff_ac = ocl.allocMem(3 * channel_step_size);
-
-    clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
-
-    clEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step);
-    clBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step);
-    clEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step);
-    {
-        ocl_channels mask = ocl.allocMemChannels(channel_size);
-        ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
-        clMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize);
-        clCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step);
-
-        ocl.releaseMemChannels(mask);
-        ocl.releaseMemChannels(mask_dc);
-    }
-
-    clCalculateDiffmapEx(mem_result, xsize, ysize, step);
+    clDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, step);
 
     clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, result, 0, NULL, NULL);
     cl_int err = clFinish(ocl.commandQueue);
@@ -69,10 +46,6 @@ void clDiffmapOpsinDynamicsImage(
     ocl.releaseMemChannels(xyb1);
     ocl.releaseMemChannels(xyb0);
 
-    clReleaseMemObject(edge_detector_map);
-    clReleaseMemObject(block_diff_dc);
-    clReleaseMemObject(block_diff_ac);
-
     clReleaseMemObject(mem_result);
 }
 
@@ -182,6 +155,46 @@ void clMask(
     ocl.releaseMemChannels(mask_dc);
 }
 
+void clDiffmapOpsinDynamicsImageEx(  // runs the diffmap stages on buffers supplied by the caller; performs no host read-back itself
+    cl_mem result,  // handed to clCombineChannelsEx and then to clCalculateDiffmapEx
+    ocl_channels xyb0,
+    ocl_channels xyb1,
+    const size_t xsize, const size_t ysize,
+    const size_t step)
+{
+    const size_t res_xsize = (xsize + step - 1) / step;  // ceil(xsize / step)
+    const size_t res_ysize = (ysize + step - 1) / step;  // ceil(ysize / step)
+
+    size_t channel_size = xsize * ysize * sizeof(float);  // bytes in one xsize*ysize float plane
+    size_t channel_step_size = res_xsize * res_ysize * sizeof(float);  // bytes in one res_xsize*res_ysize float plane
+
+    ocl_args_d_t &ocl = getOcl();
+ 
+    cl_mem edge_detector_map = ocl.allocMem(3 * channel_step_size);  // temporaries: three reduced-size planes each
+    cl_mem block_diff_dc = ocl.allocMem(3 * channel_step_size);
+    cl_mem block_diff_ac = ocl.allocMem(3 * channel_step_size);
+
+    clMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);  // NOTE(review): no separate output argument, so this presumably updates xyb0/xyb1 in place - confirm
+
+    clEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step);
+    clBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step);
+    clEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step);
+    {
+        ocl_channels mask = ocl.allocMemChannels(channel_size);  // scratch channels, released at the end of this scope
+        ocl_channels mask_dc = ocl.allocMemChannels(channel_size);
+        clMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize);
+        clCombineChannelsEx(result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step);
+
+        ocl.releaseMemChannels(mask);
+        ocl.releaseMemChannels(mask_dc);
+    }
+
+    clCalculateDiffmapEx(result, xsize, ysize, step);
+
+    clReleaseMemObject(edge_detector_map);  // only the temporaries allocated above are freed; result, xyb0 and xyb1 are not released here
+    clReleaseMemObject(block_diff_dc);
+    clReleaseMemObject(block_diff_ac);
+}
 void clConvolutionEx(
     cl_mem result/*out*/,
     const cl_mem inp, size_t xsize, size_t ysize,
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index cad4ef6e..9418b38d 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -48,6 +48,14 @@ void clMask(
     const float* r,  const float* g,  const float* b,
     const float* r2, const float* g2, const float* b2);
 
+void clDiffmapOpsinDynamicsImageEx(
+    cl_mem result,
+    ocl_channels xyb0,
+    ocl_channels xyb1,
+    const size_t xsize, const size_t ysize,
+    const size_t step);
+
+
 void clConvolutionEx(
     cl_mem result/*out*/,
     const cl_mem inp, size_t xsize, size_t ysize,
diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
index b73967d9..97fb2d10 100644
--- a/clguetzli/cuguetzli.cpp
+++ b/clguetzli/cuguetzli.cpp
@@ -34,11 +34,7 @@ void cuDiffmapOpsinDynamicsImage(
     const size_t xsize, const size_t ysize,
     const size_t step)
 {
-    const size_t res_xsize = (xsize + step - 1) / step;
-    const size_t res_ysize = (ysize + step - 1) / step;
-
     size_t channel_size = xsize * ysize * sizeof(float);
-    size_t channel_step_size = res_xsize * res_ysize * sizeof(float);
 
     ocu_args_d_t &ocl = getOcu();
     ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
@@ -46,36 +42,13 @@ void cuDiffmapOpsinDynamicsImage(
 
     cu_mem mem_result = ocl.allocMem(channel_size, result);
 
-    cu_mem edge_detector_map = ocl.allocMem(3 * channel_step_size);
-    cu_mem block_diff_dc = ocl.allocMem(3 * channel_step_size);
-    cu_mem block_diff_ac = ocl.allocMem(3 * channel_step_size);
-
-    cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
-
-    cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step);
-    cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step);
-    cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step);
-    {
-        ocu_channels mask = ocl.allocMemChannels(channel_size);
-        ocu_channels mask_dc = ocl.allocMemChannels(channel_size);
-        cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize);
-        cuCombineChannelsEx(mem_result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step);
-
-        ocl.releaseMemChannels(mask);
-        ocl.releaseMemChannels(mask_dc);
-    }
-
-    cuCalculateDiffmapEx(mem_result, xsize, ysize, step);
+    cuDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, step);
 
     cuMemcpyDtoH(result, mem_result, channel_size);
 
     ocl.releaseMemChannels(xyb1);
     ocl.releaseMemChannels(xyb0);
 
-    cuMemFree(edge_detector_map);
-    cuMemFree(block_diff_dc);
-    cuMemFree(block_diff_ac);
-
     cuMemFree(mem_result);
 }
 
@@ -188,6 +161,47 @@ void cuMask(
     ocl.releaseMemChannels(mask_dc);
 }
 
+void cuDiffmapOpsinDynamicsImageEx(  // runs the diffmap stages on buffers supplied by the caller; performs no host read-back itself
+    cu_mem result,  // handed to cuCombineChannelsEx and then to cuCalculateDiffmapEx
+    ocu_channels xyb0,
+    ocu_channels xyb1,
+    const size_t xsize, const size_t ysize,
+    const size_t step)
+{
+    const size_t res_xsize = (xsize + step - 1) / step;  // ceil(xsize / step)
+    const size_t res_ysize = (ysize + step - 1) / step;  // ceil(ysize / step)
+
+    size_t channel_size = xsize * ysize * sizeof(float);  // bytes in one xsize*ysize float plane
+    size_t channel_step_size = res_xsize * res_ysize * sizeof(float);  // bytes in one res_xsize*res_ysize float plane
+
+    ocu_args_d_t &ocl = getOcu();
+ 
+    cu_mem edge_detector_map = ocl.allocMem(3 * channel_step_size);  // temporaries: three reduced-size planes each
+    cu_mem block_diff_dc = ocl.allocMem(3 * channel_step_size);
+    cu_mem block_diff_ac = ocl.allocMem(3 * channel_step_size);
+
+    cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);  // NOTE(review): no separate output argument, so this presumably updates xyb0/xyb1 in place - confirm
+
+    cuEdgeDetectorMapEx(edge_detector_map, xyb0, xyb1, xsize, ysize, step);
+    cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step);
+    cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step);
+    {
+        ocu_channels mask = ocl.allocMemChannels(channel_size);  // scratch channels, released at the end of this scope
+        ocu_channels mask_dc = ocl.allocMemChannels(channel_size);
+        cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize);
+        cuCombineChannelsEx(result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step);
+
+        ocl.releaseMemChannels(mask);
+        ocl.releaseMemChannels(mask_dc);
+    }
+
+    cuCalculateDiffmapEx(result, xsize, ysize, step);
+
+    cuMemFree(edge_detector_map);  // only the temporaries allocated above are freed; result, xyb0 and xyb1 are not released here
+    cuMemFree(block_diff_dc);
+    cuMemFree(block_diff_ac);
+}
+
 void cuConvolutionEx(
     cu_mem result/*out*/,
     const cu_mem inp, size_t xsize, size_t ysize,
diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h
index 81ec377b..5082ea1c 100644
--- a/clguetzli/cuguetzli.h
+++ b/clguetzli/cuguetzli.h
@@ -1,6 +1,7 @@
 #pragma once
 #include "guetzli/processor.h"
 #include "clguetzli.cl.h"
+#include "ocu.h"
 
 #ifdef __USE_CUDA__
 
@@ -34,6 +35,13 @@ void cuMask(
     const float* r, const float* g, const float* b,
     const float* r2, const float* g2, const float* b2);
 
+void cuDiffmapOpsinDynamicsImageEx(
+    cu_mem result,
+    ocu_channels xyb0,
+    ocu_channels xyb1,
+    const size_t xsize, const size_t ysize,
+    const size_t step);
+
 void cuConvolutionXEx(
     cu_mem result/*out*/,
     const cu_mem inp, size_t xsize, size_t ysize,
diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc
index 02256e95..1c6342c8 100644
--- a/guetzli/butteraugli_comparator.cc
+++ b/guetzli/butteraugli_comparator.cc
@@ -22,6 +22,10 @@
 #include "guetzli/gamma_correct.h"
 #include "guetzli/score.h"
 
+#include "clguetzli\ocu.h"
+#include "clguetzli\clguetzli.h"
+#include "clguetzli\cuguetzli.h"
+
 namespace guetzli {
 
 ButteraugliComparator::ButteraugliComparator(const int width, const int height,
@@ -51,8 +55,60 @@ ButteraugliComparator::ButteraugliComparator(const int width, const int height,
 void ButteraugliComparator::Compare(const OutputImage& img) {
   std::vector<std::vector<float> > rgb(3, std::vector<float>(width_ * height_));
   img.ToLinearRGB(&rgb);
-  ::butteraugli::OpsinDynamicsImage(width_, height_, rgb);
-  comparator_.DiffmapOpsinDynamicsImage(rgb_linear_pregamma_, rgb, distmap_);
+  // Fill distmap_ for img using the backend selected by g_mathMode: OpenCL, CUDA (when built in), otherwise CPU.
+  if (MODE_OPENCL == g_mathMode)
+  {
+      const int xsize = width_;
+      const int ysize = height_;
+      distmap_.resize(xsize * ysize);
+
+      size_t channel_size = xsize * ysize * sizeof(float);
+      ocl_args_d_t &ocl = getOcl();
+      ocl_channels xyb1 = ocl.allocMemChannels(channel_size, rgb[0].data(), rgb[1].data(), rgb[2].data());
+      ocl_channels xyb0 = ocl.allocMemChannels(channel_size, rgb_linear_pregamma_[0].data(), rgb_linear_pregamma_[1].data(), rgb_linear_pregamma_[2].data());
+
+      cl_mem mem_result = ocl.allocMem(channel_size, distmap_.data());
+
+      clOpsinDynamicsImageEx(xyb1, xsize, ysize);
+      clDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, comparator_.step_);
+
+      clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, distmap_.data(), 0, NULL, NULL);
+      clFinish(ocl.commandQueue);  // the read above is enqueued non-blocking; wait before distmap_ is consumed below
+
+      clReleaseMemObject(mem_result);
+      ocl.releaseMemChannels(xyb0);
+      ocl.releaseMemChannels(xyb1);
+  }
+#ifdef __USE_CUDA__  // must be the macro that guards cuguetzli.h; the former __HAVE_CUDA__ is defined nowhere, so this branch was never built
+  else if (MODE_CUDA == g_mathMode)
+  {
+      const int xsize = width_;
+      const int ysize = height_;
+      distmap_.resize(xsize * ysize);  // same sizing as the OpenCL branch: distmap_.data() receives channel_size bytes below
+      size_t channel_size = xsize * ysize * sizeof(float);
+      ocu_args_d_t &ocl = getOcu();
+      ocu_channels xyb1 = ocl.allocMemChannels(channel_size, rgb[0].data(), rgb[1].data(), rgb[2].data());
+      ocu_channels xyb0 = ocl.allocMemChannels(channel_size, rgb_linear_pregamma_[0].data(), rgb_linear_pregamma_[1].data(), rgb_linear_pregamma_[2].data());
+
+      cu_mem mem_result = ocl.allocMem(channel_size, distmap_.data());
+
+      cuOpsinDynamicsImageEx(xyb1, xsize, ysize);
+
+      cuDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, comparator_.step_);
+
+      cuMemcpyDtoH(distmap_.data(), mem_result, channel_size);
+
+      cuMemFree(mem_result);
+      ocl.releaseMemChannels(xyb0);
+      ocl.releaseMemChannels(xyb1);
+  } 
+#endif
+  else
+    {
+        ::butteraugli::OpsinDynamicsImage(width_, height_, rgb);
+        comparator_.DiffmapOpsinDynamicsImage(rgb_linear_pregamma_, rgb, distmap_);
+    }
+
   distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_);
   GUETZLI_LOG(stats_, " BA[100.00%%] D[%6.4f]", distance_);
 }
diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc
index 276cb9d6..9e1be556 100644
--- a/guetzli/guetzli.cc
+++ b/guetzli/guetzli.cc
@@ -226,9 +226,11 @@ void Usage() {
       "  --memlimit M - Memory limit in MB. Guetzli will fail if unable to stay under\n"
       "                 the limit. Default limit is %d MB.\n"
 	  "  --opencl     - Use OpenCL\n"
-	  "  --cuda       - Use CUDA\n"	 
       "  --checkcl    - Check OpenCL result\n"
+#ifdef __USE_CUDA__
+	  "  --cuda       - Use CUDA\n"	 
       "  --checkcuda  - Check CUDA result\n"
+#endif
       "  --nomemlimit - Do not limit memory usage.\n", kDefaultJPEGQuality, kDefaultMemlimitMB);
   exit(1);
 }
@@ -264,15 +266,17 @@ int main(int argc, char** argv) {
 	else if (!strcmp(argv[opt_idx], "--opencl")) {
 		g_mathMode = MODE_OPENCL;
 	}
-	else if (!strcmp(argv[opt_idx], "--cuda")) {
-		g_mathMode = MODE_CUDA;
-	}
     else if (!strcmp(argv[opt_idx], "--checkcl")) {
         g_mathMode = MODE_CHECKCL;
     }
+#ifdef __USE_CUDA__
+	else if (!strcmp(argv[opt_idx], "--cuda")) {
+		g_mathMode = MODE_CUDA;
+	}
     else if (!strcmp(argv[opt_idx], "--checkcuda")) {
         g_mathMode = MODE_CHECKCUDA;
     }
+#endif
 	else if (!strcmp(argv[opt_idx], "--")) {
       opt_idx++;
       break;
diff --git a/third_party/butteraugli/butteraugli/butteraugli.h b/third_party/butteraugli/butteraugli/butteraugli.h
index 637f50ff..16040e95 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.h
+++ b/third_party/butteraugli/butteraugli/butteraugli.h
@@ -72,6 +72,7 @@ class ButteraugliComparator {
                        const std::vector<float>& edge_detector_map,
                        std::vector<float>* result);
 
+public:
   const size_t xsize_;
   const size_t ysize_;
   const size_t num_pixels_;

From 39950066092cf7bfc940b7bb0fab0a168f843857 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 6 Jun 2017 22:02:13 +0800
Subject: [PATCH 145/189] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=B8=80=E4=BA=9B?=
 =?UTF-8?q?=E8=B0=83=E8=AF=95=E4=BF=A1=E6=81=AF?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl | 11 ++++++++---
 compile.bat            |  2 +-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 7bacec40..bc0c4d48 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -3072,8 +3072,10 @@ __device__ void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize])
 
 __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block)
 { 
-//    CalcOpsinDynamicsImage(rgb0_c);
-    CalcOpsinDynamicsImage(rgb1_c);
+//    return 0;       // 126ms 
+//    CalcOpsinDynamicsImage(rgb0_c);  -- calc in cpu one time
+    CalcOpsinDynamicsImage(rgb1_c);     
+//    return 0;       // 425ms
 
     float rgb0[3][kDCTBlockSize];
     float rgb1[3][kDCTBlockSize];
@@ -3086,7 +3088,7 @@ __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize],
                                 rgb0_c[0], rgb0_c[1], rgb0_c[2],
                                 rgb1_c[0], rgb1_c[1], rgb1_c[2],
                                 8, 8);
-
+//    return 0;       // 544ms
     // ����ΪɶҪ��floatת��double���ܼ��������㣿
     double b0[3 * kDCTBlockSize];       // 
     double b1[3 * kDCTBlockSize];
@@ -3102,6 +3104,7 @@ __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize],
     double diff_xyz_edge_dc[3] = { 0.0 };
     ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc);
 
+//    return 0;       // 735ms
     double diff = 0.0;
     double diff_edge = 0.0;
 
@@ -3112,6 +3115,8 @@ __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize],
     }
     const double kEdgeWeight = 0.05;
     return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge);
+
+//   750ms
 }
 
 // return the count of Non-zero item
diff --git a/compile.bat b/compile.bat
index c7cd2cc7..3b44020d 100644
--- a/compile.bat
+++ b/compile.bat
@@ -9,4 +9,4 @@ set debug_opt=%2%
 
 if "%machine_num%" == "" set machine_num=64
 
-nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 --machine %machine_num% %debug_opt% -ptx -o clguetzli\clguetzli.cu.ptx%machine_num%  clguetzli\clguetzli.cu
\ No newline at end of file
+nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 -lineinfo -O3 --machine %machine_num% %debug_opt% -ptx -o clguetzli\clguetzli.cu.ptx%machine_num%  clguetzli\clguetzli.cu
\ No newline at end of file

From 1aa86d55b97bb0a7ee5c67f4133d9891f2819420 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 7 Jun 2017 10:12:13 +0800
Subject: [PATCH 146/189] =?UTF-8?q?kernel=E8=BF=90=E7=AE=97=E7=94=A8float?=
 =?UTF-8?q?=E6=9B=BF=E4=BB=A3double=EF=BC=8C=E8=8A=82=E7=9C=81=E8=BF=90?=
 =?UTF-8?q?=E7=AE=97=E6=97=B6=E9=97=B4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl     | 7 +++++++
 clguetzli/clguetzli.cl.cpp | 6 ++++--
 clguetzli/clguetzli.cl.h   | 2 ++
 clguetzli/clguetzli.cpp    | 8 ++++++++
 clguetzli/clguetzli.h      | 8 ++++++++
 clguetzli/cuguetzli.cpp    | 8 ++++++++
 clguetzli/cuguetzli.h      | 8 ++++++++
 7 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index bc0c4d48..893b941f 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -2,6 +2,10 @@
 
 #include  "clguetzli/clguetzli.cl.h"
 
+#ifdef __USE_DOUBLE_AS_FLOAT__
+#define double float
+#endif
+
 #define kBlockEdge      8
 #define kBlockSize      (kBlockEdge * kBlockEdge)
 #define kDCTBlockSize   (kBlockEdge * kBlockEdge)
@@ -3419,3 +3423,6 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
     }
 }
 
+#ifdef __USE_DOUBLE_AS_FLOAT__
+#undef double
+#endif
diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp
index a18cd110..6d1ae45b 100644
--- a/clguetzli/clguetzli.cl.cpp
+++ b/clguetzli/clguetzli.cl.cpp
@@ -49,9 +49,11 @@ namespace guetzli
         const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x);
         const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y);
         const int num_blocks = block_width * block_height;
-
+#ifdef __USE_DOUBLE_AS_FLOAT__
+        const float* lut = kSrgb8ToLinearTable;
+#else
         const double* lut = kSrgb8ToLinearTable;
-
+#endif
         imgOpsinDynamicsBlockList.resize(num_blocks * 3 * kDCTBlockSize);
         imgMaskXyzScaleBlockList.resize(num_blocks * 3);
         for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y)
diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h
index 288db67c..102f3ac9 100644
--- a/clguetzli/clguetzli.cl.h
+++ b/clguetzli/clguetzli.cl.h
@@ -8,6 +8,8 @@
 #endif
 #endif
 
+#define __USE_DOUBLE_AS_FLOAT__
+
 #ifdef __cplusplus
 #ifndef __CUDACC__
     #define __kernel
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 0d52eb33..24c939b5 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -4,6 +4,10 @@
 #include <vector>
 #include "cl.hpp"
 
+#ifdef __USE_DOUBLE_AS_FLOAT__
+#define double float
+#endif
+
 extern MATH_MODE g_mathMode = MODE_CPU;
 
 void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize)
@@ -820,3 +824,7 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si
 
 	clReleaseMemObject(blurred);
 }
+
+#ifdef __USE_DOUBLE_AS_FLOAT__
+#undef double
+#endif
\ No newline at end of file
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 9418b38d..7bebdd66 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -7,6 +7,10 @@
 
 #include "cuguetzli.h"
 
+#ifdef __USE_DOUBLE_AS_FLOAT__
+#define double float
+#endif
+
 enum MATH_MODE
 {
     MODE_CPU = 0,
@@ -146,6 +150,10 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si
 
 class guetzli::OutputImage;
 
+#ifdef __USE_DOUBLE_AS_FLOAT__
+#undef double
+#endif
+
 namespace guetzli {
 
     class ButteraugliComparatorEx : public ButteraugliComparator
diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
index 97fb2d10..0918df75 100644
--- a/clguetzli/cuguetzli.cpp
+++ b/clguetzli/cuguetzli.cpp
@@ -4,6 +4,10 @@
 
 #ifdef __USE_CUDA__
 
+#ifdef __USE_DOUBLE_AS_FLOAT__
+#define double float
+#endif
+
 #define cuFinish cuStreamSynchronize
 #define BLOCK_SIZE_X 16
 #define BLOCK_SIZE_Y 16
@@ -885,4 +889,8 @@ void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const si
     cuMemFree(blurred);
 }
 
+#ifdef __USE_DOUBLE_AS_FLOAT__
+#undef double
+#endif
+
 #endif
\ No newline at end of file
diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h
index 5082ea1c..a75dcc46 100644
--- a/clguetzli/cuguetzli.h
+++ b/clguetzli/cuguetzli.h
@@ -5,6 +5,10 @@
 
 #ifdef __USE_CUDA__
 
+#ifdef __USE_DOUBLE_AS_FLOAT__
+#define double float
+#endif
+
 void cuOpsinDynamicsImage(
 	float *r, float *g, float *b, 
 	const size_t xsize, const size_t ysize);
@@ -124,4 +128,8 @@ void cuAddBorderEx(cu_mem out, const size_t xsize, const size_t ysize, const int
 
 void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const size_t ysize, const int step);
 
+#ifdef __USE_DOUBLE_AS_FLOAT__
+#undef double
+#endif
+
 #endif
\ No newline at end of file

From 8ed0ce3e72ddef9b08eb3433c305fb559e2aca2e Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 7 Jun 2017 12:32:24 +0800
Subject: [PATCH 147/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E6=95=B0=E7=BB=84?=
 =?UTF-8?q?=E9=95=BF=E5=BA=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cpp | 2 +-
 clguetzli/cuguetzli.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 24c939b5..8f39fb46 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -659,7 +659,7 @@ void clDoMask(ocl_channels mask/*in, out*/, ocl_channels mask_dc/*in, out*/, siz
         MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb);
     }
 
-	size_t channel_size = 512 * 3 * sizeof(double);
+	size_t channel_size = 512 * sizeof(double);
 	ocl_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b);
     ocl_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb);
 
diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
index 0918df75..3b8c2835 100644
--- a/clguetzli/cuguetzli.cpp
+++ b/clguetzli/cuguetzli.cpp
@@ -709,7 +709,7 @@ void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, siz
         MakeMask(extmul, extoff, mul, offset, scaler, lut_dcb);
     }
 
-    size_t channel_size = 512 * 3 * sizeof(double);
+    size_t channel_size = 512 * sizeof(double);
     ocu_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b);
     ocu_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb);
 

From 7aff1646bdb052206bb139ad01d40b62737ceba9 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 7 Jun 2017 17:08:29 +0800
Subject: [PATCH 148/189] =?UTF-8?q?=E6=88=91=E4=B9=9F=E4=B8=8D=E7=9F=A5?=
 =?UTF-8?q?=E9=81=93=E4=B8=BA=E4=BB=80=E4=B9=88=EF=BC=8C=E5=88=A0=E9=99=A4?=
 =?UTF-8?q?=E6=8E=89=E8=BF=99=E4=B8=AA=E7=A9=BA=E8=A1=8C=E8=AE=A1=E7=AE=97?=
 =?UTF-8?q?=E7=BB=93=E6=9E=9C=E5=B0=B1=E6=AD=A3=E7=A1=AE=E4=BA=86=20?=
 =?UTF-8?q?=E8=82=AF=E5=AE=9A=E6=98=AFopencl=E7=94=9F=E6=88=90=E4=BB=A3?=
 =?UTF-8?q?=E7=A0=81=E6=97=B6=E6=9C=89bug!?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 893b941f..a947f770 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -641,7 +641,6 @@ __kernel void clDoMaskEx(
     mask_dc_x[idx] = (float)(InterpolateClampNegative(lut_dc_x, 512, p0));
     mask_dc_y[idx] = (float)(InterpolateClampNegative(lut_dc_y, 512, p1));
     mask_dc_b[idx] = (float)(InterpolateClampNegative(lut_dc_b, 512, p2));
-
 }
 
 __kernel void clCombineChannelsEx(

From f795ad1e99cd9f199f662a10325c6679379268a5 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 7 Jun 2017 18:26:39 +0800
Subject: [PATCH 149/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E7=BC=96=E8=AF=91?=
 =?UTF-8?q?=E9=85=8D=E7=BD=AE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 guetzli.vcxproj | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index e6070b25..e6e2be45 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -144,7 +144,7 @@
       <IntrinsicFunctions>false</IntrinsicFunctions>
       <MinimalRebuild>false</MinimalRebuild>
       <StringPooling>true</StringPooling>
-      <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>__USE_CUDA__;PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
@@ -408,14 +408,14 @@
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">clguetzli\clguetzli.cu.ptx64</Outputs>
       <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ProjectDir)compile.bat 64 -G</Command>
       <Message Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CUDA Code Builder</Message>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cu.ptx</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">clguetzli\clguetzli.cu.ptx64</Outputs>
       <LinkObjects Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</LinkObjects>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
       <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ProjectDir)compile.bat 32 -G</Command>
       <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CUDA Code Builder</Message>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cu.ptx</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clguetzli\clguetzli.cu.ptx32</Outputs>
       <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CUDA Code Builder</Message>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cu.ptx</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clguetzli\clguetzli.cu.ptx32</Outputs>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
     </CustomBuild>

From 13abc16a44abb1e45f029474e82df16df5c49d68 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 7 Jun 2017 18:49:55 +0800
Subject: [PATCH 150/189] =?UTF-8?q?=E4=BF=AE=E6=AD=A3warning?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index a947f770..6ddc81c5 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -271,9 +271,9 @@ __kernel void clMaskHighIntensityChangeEx(
 
     size_t ix = y * xsize + x;
     const double ave[3] = {
-        (c0_x[ix] + c1_x[ix]) * 0.5,
-        (c0_y[ix] + c1_y[ix]) * 0.5,
-        (c0_b[ix] + c1_b[ix]) * 0.5,
+        (c0_x[ix] + c1_x[ix]) * 0.5f,
+        (c0_y[ix] + c1_y[ix]) * 0.5f,
+        (c0_b[ix] + c1_b[ix]) * 0.5f,
     };
     double sqr_max_diff = -1;
     {
@@ -2992,9 +2992,9 @@ __device__ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float
         {
             size_t ix = y * xsize + x;
             const double ave[3] = {
-                (c0_x[ix] + c1_x[ix]) * 0.5,
-                (c0_y[ix] + c1_y[ix]) * 0.5,
-                (c0_b[ix] + c1_b[ix]) * 0.5,
+                (c0_x[ix] + c1_x[ix]) * 0.5f,
+                (c0_y[ix] + c1_y[ix]) * 0.5f,
+                (c0_b[ix] + c1_b[ix]) * 0.5f,
             };
             double sqr_max_diff = -1;
             {

From 0c85b8fe63ce86e1935f936bcff06bf6f9b1febd Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 7 Jun 2017 18:50:10 +0800
Subject: [PATCH 151/189] =?UTF-8?q?=E6=8D=A2=E4=B8=80=E7=BB=84=E7=BC=96?=
 =?UTF-8?q?=E8=AF=91=E5=8F=82=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 compile.bat | 2 +-
 compile.sh  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/compile.bat b/compile.bat
index 3b44020d..1b98c758 100644
--- a/compile.bat
+++ b/compile.bat
@@ -9,4 +9,4 @@ set debug_opt=%2%
 
 if "%machine_num%" == "" set machine_num=64
 
-nvcc -Xcompiler "/wd 4819" -I"./" -arch=compute_30 -lineinfo -O3 --machine %machine_num% %debug_opt% -ptx -o clguetzli\clguetzli.cu.ptx%machine_num%  clguetzli\clguetzli.cu
\ No newline at end of file
+nvcc -Xcompiler "/wd 4819" -I"./" -use_fast_math -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine %machine_num% %debug_opt% -ptx -o clguetzli\clguetzli.cu.ptx%machine_num%  clguetzli\clguetzli.cu
\ No newline at end of file
diff --git a/compile.sh b/compile.sh
index 9aa628bc..40cc3db3 100644
--- a/compile.sh
+++ b/compile.sh
@@ -4,7 +4,7 @@
 echo $1 --machine 64 or 32
 echo $2 -G
 
-nvcc -I"./" -I"/usr/local/cuda/include" -arch=compute_30 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1  clguetzli/clguetzli.cu
+nvcc -I"./" -I"/usr/local/cuda/include" -use-fast_math -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1  clguetzli/clguetzli.cu
 
 #copy to ./bin/Release
 cp clguetzli/clguetzli.cu.ptx$1 bin/Release/clguetzli/clguetzli.cu.ptx$1

From 12cd1200b09f1911f2d0dfbbcfd9fcb9a1405220 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Wed, 7 Jun 2017 19:55:41 +0800
Subject: [PATCH 152/189] fix linux build

---
 compile.sh                        | 4 ++--
 guetzli/butteraugli_comparator.cc | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/compile.sh b/compile.sh
index 40cc3db3..0b13d464 100644
--- a/compile.sh
+++ b/compile.sh
@@ -4,9 +4,9 @@
 echo $1 --machine 64 or 32
 echo $2 -G
 
-nvcc -I"./" -I"/usr/local/cuda/include" -use-fast_math -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1  clguetzli/clguetzli.cu
+nvcc -I"./" -I"/usr/local/cuda/include" -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1  clguetzli/clguetzli.cu
 
 #copy to ./bin/Release
 cp clguetzli/clguetzli.cu.ptx$1 bin/Release/clguetzli/clguetzli.cu.ptx$1
 cp clguetzli/clguetzli.cl bin/Release/clguetzli/clguetzli.cl
-cp clguetzli/clguetzli.cl.h bin/Release/clguetzli/clguetzli.cl.h
\ No newline at end of file
+cp clguetzli/clguetzli.cl.h bin/Release/clguetzli/clguetzli.cl.h
diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc
index b3353044..f0ce5eb4 100644
--- a/guetzli/butteraugli_comparator.cc
+++ b/guetzli/butteraugli_comparator.cc
@@ -22,9 +22,9 @@
 #include "guetzli/gamma_correct.h"
 #include "guetzli/score.h"
 
-#include "clguetzli\ocu.h"
-#include "clguetzli\clguetzli.h"
-#include "clguetzli\cuguetzli.h"
+#include "clguetzli/ocu.h"
+#include "clguetzli/clguetzli.h"
+#include "clguetzli/cuguetzli.h"
 
 namespace guetzli {
 

From 7c2e57d6f228a6ffeae7cff92e7be424fd487d45 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 8 Jun 2017 01:14:18 +0800
Subject: [PATCH 153/189] =?UTF-8?q?merge=20google=E7=9A=84=E6=94=B9?=
 =?UTF-8?q?=E5=8A=A8=E4=B9=8B=E5=90=8E=EF=BC=8C=E6=AF=8F=E6=AC=A1compuare?=
 =?UTF-8?q?=20StartBlockComparisons=E9=83=BD=E4=BC=9A=E9=87=8D=E6=96=B0?=
 =?UTF-8?q?=E8=AE=A1=E7=AE=97=E5=8E=9F=E5=A7=8B=E5=9B=BE=E7=89=87=E7=9A=84?=
 =?UTF-8?q?opsin=20=E4=BC=98=E5=8C=96=E6=8E=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl.cpp                    | 158 ++++++++++++------
 clguetzli/clguetzli.h                         |   2 +
 .../butteraugli/butteraugli/butteraugli.h     |   4 +-
 3 files changed, 107 insertions(+), 57 deletions(-)

diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp
index 12e0c6c7..7bd566df 100644
--- a/clguetzli/clguetzli.cl.cpp
+++ b/clguetzli/clguetzli.cl.cpp
@@ -26,6 +26,8 @@ void set_global_size(int dim, int size){
 #define abs(exper)    fabs((exper))
 #include "clguetzli.h"
 #include "clguetzli.cl"
+#include "cuguetzli.h"
+#include "ocu.h"
 
 namespace guetzli
 {
@@ -34,12 +36,111 @@ namespace guetzli
         const float target_distance, ProcessStats* stats)
         : ButteraugliComparator(width, height, rgb, target_distance, stats)
     {
+        if (MODE_CPU != g_mathMode)
+        {
+            rgb_orig_opsin.resize(3);
+            rgb_orig_opsin[0].resize(width * height);
+            rgb_orig_opsin[1].resize(width * height);
+            rgb_orig_opsin[2].resize(width * height);
+
+#ifdef __USE_DOUBLE_AS_FLOAT__
+            const float* lut = kSrgb8ToLinearTable;
+#else
+            const double* lut = kSrgb8ToLinearTable;
+#endif
+            for (int c = 0; c < 3; ++c) {
+                for (int y = 0, ix = 0; y < height_; ++y) {
+                    for (int x = 0; x < width_; ++x, ++ix) {
+                        rgb_orig_opsin[c][ix] = lut[rgb_orig_[3 * ix + c]];
+                    }
+                }
+            }
+            ::butteraugli::OpsinDynamicsImage(width_, height_, rgb_orig_opsin);
+        }
+    }
+
+    void ButteraugliComparatorEx::Compare(const OutputImage& img)
+    {
+        // Scores img against the original image: the GPU paths fill distmap_ and distance_, otherwise the base class does.
+        if (MODE_OPENCL == g_mathMode)
+        {
+            std::vector<std::vector<float> > rgb1(3, std::vector<float>(width_ * height_));
+            img.ToLinearRGB(&rgb1);
+            // Discard the previous distance map and allocate one float per pixel.
+            const int xsize = width_;
+            const int ysize = height_;
+            std::vector<float>().swap(distmap_);
+            distmap_.resize(xsize * ysize);
+            // xyb0 is uploaded from the cached rgb_orig_opsin channels, xyb1 from the candidate's linear RGB.
+            size_t channel_size = xsize * ysize * sizeof(float);
+            ocl_args_d_t &ocl = getOcl();
+            ocl_channels xyb0 = ocl.allocMemChannels(channel_size, rgb_orig_opsin[0].data(), rgb_orig_opsin[1].data(), rgb_orig_opsin[2].data());
+            ocl_channels xyb1 = ocl.allocMemChannels(channel_size, rgb1[0].data(), rgb1[1].data(), rgb1[2].data());
+            // Device buffer that is read back into distmap_ below.
+            cl_mem mem_result = ocl.allocMem(channel_size);
+            // Only xyb1 is passed to the opsin step here; rgb_orig_opsin went through OpsinDynamicsImage in the constructor.
+            clOpsinDynamicsImageEx(xyb1, xsize, ysize);
+            clDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, comparator_.step());
+            // The read is enqueued with the blocking flag set to false; clFinish() waits for it before distmap_ is used.
+            cl_int err = clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, distmap_.data(), 0, NULL, NULL);
+            LOG_CL_RESULT(err);
+            err = clFinish(ocl.commandQueue);
+            LOG_CL_RESULT(err);
+            // Release the device allocations made above.
+            clReleaseMemObject(mem_result);
+            ocl.releaseMemChannels(xyb0);
+            ocl.releaseMemChannels(xyb1);
+            // Reduce the diff map to a single distance on the host.
+            distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_);
+        }
+#ifdef __USE_CUDA__
+        else if (MODE_CUDA == g_mathMode)
+        {
+            std::vector<std::vector<float> > rgb1(3, std::vector<float>(width_ * height_));
+            img.ToLinearRGB(&rgb1);
+            // Same sequence as the OpenCL branch, using the CUDA wrappers.
+            const int xsize = width_;
+            const int ysize = height_;
+            std::vector<float>().swap(distmap_);
+            distmap_.resize(xsize * ysize);
+
+            size_t channel_size = xsize * ysize * sizeof(float);
+            ocu_args_d_t &ocl = getOcu();
+            ocu_channels xyb0 = ocl.allocMemChannels(channel_size, rgb_orig_opsin[0].data(), rgb_orig_opsin[1].data(), rgb_orig_opsin[2].data());
+            ocu_channels xyb1 = ocl.allocMemChannels(channel_size, rgb1[0].data(), rgb1[1].data(), rgb1[2].data());
+            // Device buffer that is copied back into distmap_ below.
+            cu_mem mem_result = ocl.allocMem(channel_size);
+
+            cuOpsinDynamicsImageEx(xyb1, xsize, ysize);
+
+            cuDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, comparator_.step());
+            // NOTE(review): the results of cuMemcpyDtoH and cuMemFree are not checked, unlike the logged OpenCL calls.
+            cuMemcpyDtoH(distmap_.data(), mem_result, channel_size);
+
+            cuMemFree(mem_result);
+            ocl.releaseMemChannels(xyb0);
+            ocl.releaseMemChannels(xyb1);
+
+            distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_);
+        }
+#endif
+        else
+        {
+            ButteraugliComparator::Compare(img);
+        }
 
     }
 
     void ButteraugliComparatorEx::StartBlockComparisons()
     {
-        ButteraugliComparator::StartBlockComparisons();
+        if (MODE_CPU == g_mathMode)
+        {
+            ButteraugliComparator::StartBlockComparisons();
+            return;
+        }
+
+        std::vector<std::vector<float> > dummy(3);
+        ::butteraugli::Mask(rgb_orig_opsin, rgb_orig_opsin, width_, height_, &mask_xyz_, &dummy);
 
         const int width = width_;
         const int height = height_;
@@ -129,57 +230,4 @@ namespace guetzli
 */
         return err;
     }
-}
-
-/*
-if (MODE_OPENCL == g_mathMode)
-{
-const int xsize = width_;
-const int ysize = height_;
-std::vector<float>().swap(distmap_);
-distmap_.resize(xsize * ysize);
-
-size_t channel_size = xsize * ysize * sizeof(float);
-ocl_args_d_t &ocl = getOcl();
-ocl_channels xyb1 = ocl.allocMemChannels(channel_size, rgb[0].data(), rgb[1].data(), rgb[2].data());
-ocl_channels xyb0 = ocl.allocMemChannels(channel_size, rgb0[0].data(), rgb0[1].data(), rgb0[2].data());
-
-cl_mem mem_result = ocl.allocMem(channel_size);// , distmap_.data());
-
-clOpsinDynamicsImageEx(xyb1, xsize, ysize);
-clDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, comparator_.step_);
-
-clEnqueueReadBuffer(ocl.commandQueue, mem_result, false, 0, channel_size, distmap_.data(), 0, NULL, NULL);
-clFinish(ocl.commandQueue);
-
-clReleaseMemObject(mem_result);
-ocl.releaseMemChannels(xyb0);
-ocl.releaseMemChannels(xyb1);
-}
-#ifdef __USE_CUDA__
-else if (MODE_CUDA == g_mathMode)
-{
-const int xsize = width_;
-const int ysize = height_;
-std::vector<float>().swap(distmap_);
-distmap_.resize(xsize * ysize);
-
-size_t channel_size = xsize * ysize * sizeof(float);
-ocu_args_d_t &ocl = getOcu();
-ocu_channels xyb1 = ocl.allocMemChannels(channel_size, rgb[0].data(), rgb[1].data(), rgb[2].data());
-ocu_channels xyb0 = ocl.allocMemChannels(channel_size, rgb0[0].data(), rgb0[1].data(), rgb0[2].data());
-
-cu_mem mem_result = ocl.allocMem(channel_size);// , distmap_.data());
-
-cuOpsinDynamicsImageEx(xyb1, xsize, ysize);
-
-cuDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, comparator_.step_);
-
-cuMemcpyDtoH(distmap_.data(), mem_result, channel_size);
-
-cuMemFree(mem_result);
-ocl.releaseMemChannels(xyb0);
-ocl.releaseMemChannels(xyb1);
-}
-#endif
-else*/
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index 7bebdd66..d5c04492 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -163,6 +163,7 @@ namespace guetzli {
             const std::vector<uint8_t>* rgb,
             const float target_distance, ProcessStats* stats);
 
+        void Compare(const OutputImage& img) override;
         void StartBlockComparisons() override;
         void FinishBlockComparisons() override;
 
@@ -170,5 +171,6 @@ namespace guetzli {
     public:
         std::vector<float> imgOpsinDynamicsBlockList;   // [RR..RRGG..GGBB..BB]:blockCount
         std::vector<float> imgMaskXyzScaleBlockList;    // [RGBRGB..RGBRGB]:blockCount
+        std::vector<std::vector<float>> rgb_orig_opsin;
     };
 }
\ No newline at end of file
diff --git a/third_party/butteraugli/butteraugli/butteraugli.h b/third_party/butteraugli/butteraugli/butteraugli.h
index 6f0451c8..547fdc58 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.h
+++ b/third_party/butteraugli/butteraugli/butteraugli.h
@@ -49,7 +49,7 @@ class ButteraugliComparator {
   virtual void DiffmapOpsinDynamicsImage(std::vector<std::vector<float>> &xyb0,
                                  std::vector<std::vector<float>> &xyb1,
                                  std::vector<float> &result);
-
+  int step() { return step_;}
  protected:
   virtual void BlockDiffMap(const std::vector<std::vector<float> > &rgb0,
                     const std::vector<std::vector<float> > &rgb1,
@@ -72,7 +72,7 @@ class ButteraugliComparator {
                        const std::vector<float>& edge_detector_map,
                        std::vector<float>* result);
 
-public:
+protected:
   const size_t xsize_;
   const size_t ysize_;
   const size_t num_pixels_;

From 79bce89559a928cdaef34f757cd35cf5621f945c Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 8 Jun 2017 01:43:40 +0800
Subject: [PATCH 154/189] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=A4=84=E7=90=86png?=
 =?UTF-8?q?=E6=97=B6=E7=9A=84crash?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 guetzli/processor.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 2603f3f8..86648d17 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -1056,7 +1056,7 @@ bool Process(const Params& params, ProcessStats* stats,
   std::unique_ptr<ButteraugliComparator> comparator;
   if (jpg.width >= 32 && jpg.height >= 32) {
     comparator.reset(
-        new ButteraugliComparator(jpg.width, jpg.height, &rgb,
+        new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb,
                                   params.butteraugli_target, stats));
   }
   bool ok = ProcessJpegData(params, jpg, comparator.get(), &out, stats);

From 9e8bdb31f42e92f4bacf3e4ebb110314c5d481a6 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 8 Jun 2017 16:45:26 +0800
Subject: [PATCH 155/189] =?UTF-8?q?=E8=8A=82=E7=9C=81clComputeBlockZeroing?=
 =?UTF-8?q?OrderEx=E8=BF=87=E7=A8=8B=E4=B8=AD=E7=9A=84=E5=86=97=E4=BD=99?=
 =?UTF-8?q?=E8=AE=A1=E7=AE=97?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clguetzli.cl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 6ddc81c5..f0e16db0 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -832,6 +832,10 @@ __kernel void clComputeBlockZeroingOrderEx(
             mayout_block[idx] = old_coeff;
         }
 
+        if (best_err >= BlockErrorLimit)
+        {   // err only increases; once it has already reached ErrorLimit the remaining computation is unnecessary.
+            break;
+        }
         int idx = input_order.pData[best_i].idx;
         mayout_block[idx] = 0;
         list_erase(&input_order, best_i);

From 43834f77af609f7f056af9b87d9e8e2520744b45 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 8 Jun 2017 16:50:08 +0800
Subject: [PATCH 156/189] =?UTF-8?q?=E9=9D=99=E6=80=81=E5=BA=93=E7=BC=96?=
 =?UTF-8?q?=E8=AF=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 guetzli_static.vcxproj | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/guetzli_static.vcxproj b/guetzli_static.vcxproj
index 1d4d4e3f..05a75f9a 100644
--- a/guetzli_static.vcxproj
+++ b/guetzli_static.vcxproj
@@ -93,7 +93,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>Full</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
@@ -110,7 +110,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>Full</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
@@ -127,7 +127,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
     </ClCompile>
@@ -140,7 +140,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
     </ClCompile>

From e922dbf968fb1a5112ef675f3c5bd59fd126d13a Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 8 Jun 2017 21:44:23 +0800
Subject: [PATCH 157/189] =?UTF-8?q?=E7=BC=96=E8=AF=91=E5=8F=82=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 guetzli.vcxproj | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index e6e2be45..89a05fe2 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -170,7 +170,7 @@
       <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>ENABLE_OPENCL;ENABLE_OPENCL_CHECK;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>__USE_CUDA__;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>

From 230924bfee5dae001c94e57429bae0cae7b48104 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 9 Jun 2017 10:03:40 +0800
Subject: [PATCH 158/189] =?UTF-8?q?=E8=B0=83=E6=95=B4=E6=B5=8B=E8=AF=95?=
 =?UTF-8?q?=E8=84=9A=E6=9C=AC=EF=BC=8C=E6=94=AF=E6=8C=81=E7=9B=AE=E5=BD=95?=
 =?UTF-8?q?=E6=89=B9=E9=87=8F=E4=BC=98=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 guetzli.vcxproj | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 89a05fe2..86da4aa7 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -108,7 +108,7 @@
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <MinimalRebuild>false</MinimalRebuild>
       <StringPooling>true</StringPooling>
-      <PreprocessorDefinitions>__USE_CUDA__;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>

From 0e0edb11d4ddf061863ac4837e5bee61d8d2eac1 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 12 Jun 2017 10:52:15 +0800
Subject: [PATCH 159/189] Merge branch 'master' of
 https://github.com/ianhuang-777/guetzli

Conflicts:
	tests_tencent/testLinux/cp2test.sh
---
 compile.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 compile.sh

diff --git a/compile.sh b/compile.sh
old mode 100644
new mode 100755

From c31af45a96cc98d31c98ed1305e095b5c58eda60 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Mon, 12 Jun 2017 16:58:38 +0800
Subject: [PATCH 160/189] =?UTF-8?q?c=E4=BC=98=E5=8C=96=E9=80=89=E9=A1=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 guetzli.make                                       | 1 +
 guetzli/processor.cc                               | 6 ++++++
 third_party/butteraugli/butteraugli/butteraugli.cc | 9 +++++++++
 3 files changed, 16 insertions(+)

diff --git a/guetzli.make b/guetzli.make
index 3675ba0d..10231ff0 100644
--- a/guetzli.make
+++ b/guetzli.make
@@ -16,6 +16,7 @@ ifeq ($(config),release)
   TARGET = $(TARGETDIR)/guetzli
   OBJDIR = obj/Release/guetzli
   DEFINES += -D__USE_CUDA__
+  DEFINES += -D__USE_C__
   INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 86648d17..432c62f5 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -450,6 +450,12 @@ void Processor::ComputeBlockZeroingOrder(
             block_x, block_y, &processed_block[c * kDCTBlockSize]);
       }
     }
+#ifdef __USE_C__
+    if (best_err >= comparator_->BlockErrorLimit())
+    {   // err only increases; once it has already reached ErrorLimit the remaining computation is unnecessary.
+        break;
+    }
+#endif
   }
   // Make the block error values monotonic.
   float min_err = 1e10;
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index 74abc526..c21eb6b1 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -1319,9 +1319,18 @@ void _MinSquareVal(size_t square_size, size_t offset,
     const size_t minh = offset > y ? 0 : y - offset;
     const size_t maxh = std::min<size_t>(ysize, y + square_size - offset);
     for (size_t x = 0; x < xsize; ++x) {
+#ifdef __USE_C__
+      float min = values[x + minh * xsize];
+#else
       double min = values[x + minh * xsize];
+#endif
       for (size_t j = minh + 1; j < maxh; ++j) {
+#ifdef __USE_C__
+        min = std::min<float>(min, values[x + j * xsize]);
+#else
         min = fmin(min, values[x + j * xsize]);
+#endif
+
       }
       tmp[x + y * xsize] = static_cast<float>(min);
     }

From 891def12293474cf3d12a65625617e8bb86a4459 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 13 Jun 2017 00:23:09 +0800
Subject: [PATCH 161/189] =?UTF-8?q?=E4=BC=98=E5=8C=96c=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 third_party/butteraugli/butteraugli/butteraugli.cc | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index c21eb6b1..b62e1578 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -1326,7 +1326,8 @@ void _MinSquareVal(size_t square_size, size_t offset,
 #endif
       for (size_t j = minh + 1; j < maxh; ++j) {
 #ifdef __USE_C__
-        min = std::min<float>(min, values[x + j * xsize]);
+          float tmpf = values[x + j * xsize];
+          if (tmpf < min) min = tmpf;
 #else
         min = fmin(min, values[x + j * xsize]);
 #endif
@@ -1341,7 +1342,12 @@ void _MinSquareVal(size_t square_size, size_t offset,
     for (size_t y = 0; y < ysize; ++y) {
       double min = tmp[minw + y * xsize];
       for (size_t j = minw + 1; j < maxw; ++j) {
+#ifdef __USE_C__
+          float tmpf = tmp[j + y * xsize];
+          if (tmpf < min) min = tmpf;
+#else
         min = fmin(min, tmp[j + y * xsize]);
+#endif
       }
       values[x + y * xsize] = static_cast<float>(min);
     }

From 1f26bc048da138b2c490d09ec45ecf1c964c2713 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 13 Jun 2017 23:34:25 +0800
Subject: [PATCH 162/189] =?UTF-8?q?=E4=B8=8D=E4=BC=98=E5=8C=96c=E4=BA=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 guetzli.make | 1 -
 1 file changed, 1 deletion(-)

diff --git a/guetzli.make b/guetzli.make
index 10231ff0..3675ba0d 100644
--- a/guetzli.make
+++ b/guetzli.make
@@ -16,7 +16,6 @@ ifeq ($(config),release)
   TARGET = $(TARGETDIR)/guetzli
   OBJDIR = obj/Release/guetzli
   DEFINES += -D__USE_CUDA__
-  DEFINES += -D__USE_C__
   INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)

From 742b284fdb637f4b38d0393b55355bb06e20fb14 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Thu, 15 Jun 2017 16:14:12 +0800
Subject: [PATCH 163/189] =?UTF-8?q?=E4=BC=98=E5=8C=96c=E7=89=88=E6=9C=AC?=
 =?UTF-8?q?=20-double=E8=BD=AC=E4=B8=BAfloat=20-blockerror=E6=88=AA?=
 =?UTF-8?q?=E6=96=AD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clguetzli/clbutter_comparator.cpp | 1467 ++++++++++++++++++++++++++++-
 clguetzli/clbutter_comparator.h   |   23 +
 clguetzli/clguetzli.cl.cpp        |   17 +-
 clguetzli/clguetzli.h             |    1 +
 guetzli/guetzli.cc                |    5 +
 guetzli/processor.cc              |   97 +-
 6 files changed, 1552 insertions(+), 58 deletions(-)

diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index 0331940b..58e76e54 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -2,6 +2,1262 @@
 #include "clguetzli.h"
 #include "clguetzli_test.h"
 
+#include <algorithm>
+#include <array>
+
+namespace butteraugli {
+
+static const float kInternalGoodQualityThreshold = 14.921561160295326;
+static const float kGlobalScale = 1.0 / kInternalGoodQualityThreshold;
+
+inline float DotProductOpt(const float u[3], const float v[3]) {  // Dot product of two 3-element float vectors.
+	return u[0] * v[0] + u[1] * v[1] + u[2] * v[2];
+}
+
+// Computes a horizontal convolution and transposes the result.
+void ConvolutionOpt(size_t xsize, size_t ysize,  // inp is read as inp[y * xsize + column].
+	size_t xstep,  // Only every xstep-th column x produces output (index ox).
+	size_t len, size_t offset,  // Kernel length, and the kernel index that lines up with column x.
+	const float* __restrict__ multipliers,  // Kernel weights.
+	const float* __restrict__ inp,
+	float border_ratio,  // Blend between clipped-kernel and full-kernel normalization.
+	float* __restrict__ result) {  // Written transposed: result[ox * ysize + y].
+	PROFILER_FUNC;
+	float weight_no_border = 0;  // Sum of multipliers[0 .. 2 * offset].
+	for (size_t j = 0; j <= 2 * offset; ++j) {
+		weight_no_border += multipliers[j];
+	}
+	for (size_t x = 0, ox = 0; x < xsize; x += xstep, ox++) {
+		int minx = x < offset ? 0 : x - offset;  // First input column under the kernel, clipped at 0.
+		int maxx = std::min(xsize, x + len - offset) - 1;  // Last input column under the kernel, clipped at xsize - 1.
+		float weight = 0.0;  // Sum of the kernel taps that fall inside the row.
+		for (int j = minx; j <= maxx; ++j) {
+			weight += multipliers[j - x + offset];
+		}
+		// Interpolate linearly between the no-border scaling and border scaling.
+		weight = (1.0 - border_ratio) * weight + border_ratio * weight_no_border;
+		float scale = 1.0 / weight;  // Normalization applied to every sum for this column.
+		for (size_t y = 0; y < ysize; ++y) {
+			float sum = 0.0;
+			for (int j = minx; j <= maxx; ++j) {
+				sum += inp[y * xsize + j] * multipliers[j - x + offset];
+			}
+			result[ox * ysize + y] = static_cast<float>(sum * scale);
+		}
+	}
+}
+
+void BlurOpt(size_t xsize, size_t ysize, float* channel, float sigma,  // Blurs channel (xsize * ysize floats) in place.
+	float border_ratio) {  // Forwarded unchanged to both ConvolutionOpt passes.
+	PROFILER_FUNC;
+	float m = 2.25;  // Accuracy increases when m is increased.
+	const float scaler = -1.0 / (2 * sigma * sigma);  // Exponent factor of the weights exp(scaler * i * i).
+	// For m = 9.0: exp(-scaler * diff * diff) < 2^ {-52}
+	const int diff = std::max<int>(1, m * fabs(sigma));  // Kernel half-width, at least 1.
+	const int expn_size = 2 * diff + 1;  // Kernel length; the centre tap sits at index diff.
+	std::vector<float> expn(expn_size);
+	for (int i = -diff; i <= diff; ++i) {
+		expn[i + diff] = static_cast<float>(exp(scaler * i * i));
+	}
+	const int xstep = std::max(1, int(sigma / 3));  // Subsampling step; larger than 1 only once sigma / 3 reaches 2.
+	const int ystep = xstep;
+	int dxsize = (xsize + xstep - 1) / xstep;  // Subsampled sizes, rounded up.
+	int dysize = (ysize + ystep - 1) / ystep;
+	std::vector<float> tmp(dxsize * ysize);  // Transposed output of the first pass.
+	ConvolutionOpt(xsize, ysize, xstep, expn_size, diff, expn.data(), channel,
+		border_ratio,
+		tmp.data());
+	float* output = channel;  // With no subsampling the second pass writes straight back into channel.
+	std::vector<float> downsampled_output;
+	if (xstep > 1) {
+		downsampled_output.resize(dxsize * dysize);
+		output = downsampled_output.data();
+	}
+	ConvolutionOpt(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(),  // Second pass runs over the transposed data.
+		border_ratio, output);
+	if (xstep > 1) {
+		for (size_t y = 0; y < ysize; y++) {
+			for (size_t x = 0; x < xsize; x++) {
+				// TODO: Use correct rounding.
+				channel[y * xsize + x] =
+					downsampled_output[(y / ystep) * dxsize + (x / xstep)];  // Each pixel copies the subsampled cell at (y / ystep, x / xstep).
+			}
+		}
+	}
+}
+
+// To change this to n, add the relevant FFTn function and kFFTnMapIndexTable.
+constexpr size_t kBlockEdge = 8;
+constexpr size_t kBlockSize = kBlockEdge * kBlockEdge;
+constexpr size_t kBlockEdgeHalf = kBlockEdge / 2;
+constexpr size_t kBlockHalf = kBlockEdge * kBlockEdgeHalf;
+
+// Contrast sensitivity related weights.
+static const float *GetContrastSensitivityMatrixOpt() {  // Returns a pointer to the first entry of the static table below.
+	static float csf8x8[kBlockHalf + kBlockEdgeHalf + 1] = {  // 32 + 4 + 1 = 37 entries for kBlockEdge = 8.
+		5.28270670524,
+		0.0,
+		0.0,
+		0.0,
+		0.3831134973,
+		0.676303603859,
+		3.58927792424,
+		18.6104367002,
+		18.6104367002,
+		3.09093131948,
+		1.0,
+		0.498250875965,
+		0.36198671102,
+		0.308982169883,
+		0.1312701920435,
+		2.37370549629,
+		3.58927792424,
+		1.0,
+		2.37370549629,
+		0.991205724152,
+		1.05178802919,
+		0.627264168628,
+		0.4,
+		0.1312701920435,
+		0.676303603859,
+		0.498250875965,
+		0.991205724152,
+		0.5,
+		0.3831134973,
+		0.349686450518,
+		0.627264168628,
+		0.308982169883,
+		0.3831134973,
+		0.36198671102,
+		1.05178802919,
+		0.3831134973,
+		0.12,
+	};
+	return &csf8x8[0];
+}
+
+std::array<float, 21> MakeHighFreqColorDiffDxOpt() {
+	// Entries: 0, kFirstStep, then every entry adds kStep to the one before it.
+	const float kFirstStep = 11.38708334481672;
+	const float kStep = 14.550189611520716;
+	std::array<float, 21> table;
+	table[0] = 0.0;
+	table[1] = kFirstStep;
+	for (size_t k = 2; k < table.size(); ++k) table[k] = table[k - 1] + kStep;
+	return table;
+}
+
+// Returns a pointer to the 21-entry table above; it is built once on first call.
+const float *GetHighFreqColorDiffDxOpt() {
+	static const std::array<float, 21> kTable = MakeHighFreqColorDiffDxOpt();
+	return &kTable[0];
+}
+
+std::array<float, 21> MakeHighFreqColorDiffDyOpt() {
+	// Entries: 0, kFirstStep, then every entry adds kStep to the one before it.
+	const float kFirstStep = 1.4103373714040413;
+	const float kStep = 0.7084088867024;
+	std::array<float, 21> table;
+	table[0] = 0.0;
+	table[1] = kFirstStep;
+	for (size_t k = 2; k < table.size(); ++k) table[k] = table[k - 1] + kStep;
+	return table;
+}
+
+// Returns a pointer to the 21-entry table above; it is built once on first call.
+const float *GetHighFreqColorDiffDyOpt() {
+	static const std::array<float, 21> kTable = MakeHighFreqColorDiffDyOpt();
+	return &kTable[0];
+}
+
+std::array<float, 21> MakeLowFreqColorDiffDyOpt() {
+	// Running sum that starts at 0 and adds kStep for every further entry.
+	const float kStep = 5.2511644570349185;
+	std::array<float, 21> table;
+	table[0] = 0.0;
+	for (size_t k = 1; k < table.size(); ++k) table[k] = table[k - 1] + kStep;
+	return table;
+}
+
+// Returns a pointer to the 21-entry table above; it is built once on first call.
+const float *GetLowFreqColorDiffDyOpt() {
+	static const std::array<float, 21> kTable = MakeLowFreqColorDiffDyOpt();
+	return &kTable[0];
+}
+
+inline float InterpolateOpt(const float *array, int size, float sx) {
+	// Piecewise-linear lookup that is odd in sx: the table is sampled at
+	// |sx| and the sign of sx is re-applied to the result.
+	const float magnitude = fabs(sx);
+	assert(magnitude < 10000);
+	const int lower = static_cast<int>(magnitude);
+	const int last = size - 1;
+	float value;
+	if (lower < last) {
+		const float frac = magnitude - lower;
+		value = array[lower] + frac * (array[lower + 1] - array[lower]);
+	} else {
+		value = array[last];  // At or past the final entry: saturate.
+	}
+	return (sx < 0) ? -value : value;
+}
+
+inline float InterpolateClampNegativeOpt(const float *array,
+	int size, float sx) {
+	// Piecewise-linear table lookup at sx, where a negative sx is first
+	// clamped to 0 and the result is returned without any sign change.
+	if (sx < 0) {
+		sx = 0;
+	}
+	const float position = fabs(sx);
+	const int lower = static_cast<int>(position);
+	const int last = size - 1;
+	if (lower >= last) {
+		// At or past the final entry: saturate.
+		return array[last];
+	}
+	const float frac = position - lower;
+	const float step = array[lower + 1] - array[lower];
+	return array[lower] + frac * step;
+}
+
+void RgbToXybOpt(float r, float g, float b,
+	float *valx, float *valy, float *valz) {
+	// x is a weighted difference of r and g, y a weighted sum of r and g,
+	// and z is b passed through unchanged.
+	const float kXr = 1.01611726948, kXg = 0.982482243696;
+	const float kYr = 1.43571362627, kYg = 0.896039849412;
+	*valx = kXr * r - kXg * g;
+	*valy = kYr * r + kYg * g;
+	*valz = b;
+}
+
+static inline void XybToValsOpt(float x, float y, float z,
+	float *valx, float *valy, float *valz) {  // x, y: scale, then 21-entry table lookup; z: scale only.
+	const float kScaleX = 0.758304045695;
+	const float kScaleY = 2.28148649801;
+	const float kScaleZ = 1.87816926918;
+	*valx = InterpolateOpt(GetHighFreqColorDiffDxOpt(), 21, x * kScaleX);
+	*valy = InterpolateOpt(GetHighFreqColorDiffDyOpt(), 21, y * kScaleY);
+	*valz = kScaleZ * z;
+}
+
+// Rough psychovisual distance to gray for low frequency colors.
+static void XybLowFreqToValsOpt(float x, float y, float z,
+	float *valx, float *valy, float *valz) {
+	const float kScaleX = 6.64482198135;
+	const float kScaleY = 0.837846224276;
+	const float kScaleZ = 7.34905756986;
+	const float kYIntoZ = 0.0812519812628;
+	const float z_mixed = z + kYIntoZ * y;  // A share of y is added to z before scaling.
+	*valz = z_mixed * kScaleZ;
+	*valx = x * kScaleX;
+	*valy = InterpolateOpt(GetLowFreqColorDiffDyOpt(), 21, y * kScaleY);
+}
+
+float RemoveRangeAroundZeroOpt(float v, float range) {
+	// Dead zone around zero: inputs in [-range, range) map to 0, every
+	// other input is shifted by range (added below zero, subtracted otherwise).
+	const bool in_dead_zone = (v >= -range) && (v < range);
+	if (in_dead_zone) {
+		return 0;
+	}
+	// Outside the dead zone the direction of the shift follows the sign of v.
+	const float shift = (v < 0) ? range : -range;
+	return v + shift;
+}
+
+void XybDiffLowFreqSquaredAccumulateOpt(float r0, float g0, float b0,  // First color.
+	float r1, float g1, float b1,  // Second color.
+	float factor, float res[3]) {  // Adds factor * (squared per-channel difference) into res[0..2].
+	float valx0, valy0, valz0;
+	float valx1, valy1, valz1;
+	XybLowFreqToValsOpt(r0, g0, b0, &valx0, &valy0, &valz0);
+	if (r1 == 0.0 && g1 == 0.0 && b1 == 0.0) {  // Shortcut: the second color is not mapped, only the first one's values are squared.
+		PROFILER_ZONE("XybDiff r1=g1=b1=0");
+		res[0] += factor * valx0 * valx0;
+		res[1] += factor * valy0 * valy0;
+		res[2] += factor * valz0 * valz0;
+		return;
+	}
+	XybLowFreqToValsOpt(r1, g1, b1, &valx1, &valy1, &valz1);
+	// Approximate the distance of the colors by their respective distances
+	// to gray.
+	float valx = valx0 - valx1;
+	float valy = valy0 - valy1;
+	float valz = valz0 - valz1;
+	res[0] += factor * valx * valx;
+	res[1] += factor * valy * valy;
+	res[2] += factor * valz * valz;
+}
+
+// Minimal complex number: real and imaginary parts as plain public floats.
+struct ComplexOpt {
+	float real;
+	float imag;
+};
+
+inline float abssq(const ComplexOpt& c) {  // squared magnitude: re^2 + im^2
+	return c.real * c.real + c.imag * c.imag;
+}
+
+// In-place transpose of a kBlockEdge x kBlockEdge row-major block.
+static void TransposeBlock(ComplexOpt data[kBlockSize]) {
+	for (int row = 1; row < kBlockEdge; ++row) {
+		for (int col = 0; col < row; ++col)
+			std::swap(data[kBlockEdge * row + col], data[kBlockEdge * col + row]);
+	}
+}
+
+//  D. J. Bernstein's Fast Fourier Transform algorithm on 4 elements, in place.
+inline void FFT4Opt(ComplexOpt* a) {
+	float t1, t2, t3, t4, t5, t6, t7, t8;
+	t5 = a[2].real;
+	t1 = a[0].real - t5;  // t1 = a0.re - a2.re
+	t7 = a[3].real;
+	t5 += a[0].real;  // t5 = a0.re + a2.re
+	t3 = a[1].real - t7;  // t3 = a1.re - a3.re
+	t7 += a[1].real;  // t7 = a1.re + a3.re
+	t8 = t5 + t7;
+	a[0].real = t8;  // sum of all four real parts
+	t5 -= t7;
+	a[1].real = t5;  // (a0 + a2).re - (a1 + a3).re
+	t6 = a[2].imag;
+	t2 = a[0].imag - t6;  // t2 = a0.im - a2.im
+	t6 += a[0].imag;  // t6 = a0.im + a2.im
+	t5 = a[3].imag;  // t5 is reused for the imaginary half from here on
+	a[2].imag = t2 + t3;
+	t2 -= t3;
+	a[3].imag = t2;
+	t4 = a[1].imag - t5;  // t4 = a1.im - a3.im
+	a[3].real = t1 + t4;
+	t1 -= t4;
+	a[2].real = t1;
+	t5 += a[1].imag;  // t5 = a1.im + a3.im
+	a[0].imag = t6 + t5;  // sum of all four imaginary parts
+	t6 -= t5;
+	a[1].imag = t6;  // (a0 + a2).im - (a1 + a3).im
+}
+
+static const float kSqrtHalf = 0.70710678118654752440084436210484903;  // sqrt(1/2)
+
+//  D. J. Bernstein's Fast Fourier Transform algorithm on 8 elements, in place.
+void FFT8OptOpt(ComplexOpt* a) {
+	float t1, t2, t3, t4, t5, t6, t7, t8;
+
+	t7 = a[4].imag;  // first stage pairs a[k] with a[k + 4]
+	t4 = a[0].imag - t7;
+	t7 += a[0].imag;
+	a[0].imag = t7;  // a0.im + a4.im
+
+	t8 = a[6].real;
+	t5 = a[2].real - t8;
+	t8 += a[2].real;
+	a[2].real = t8;  // a2.re + a6.re
+
+	t7 = a[6].imag;  // read before a[6].imag is overwritten on the next line
+	a[6].imag = t4 - t5;
+	t4 += t5;
+	a[4].imag = t4;
+
+	t6 = a[2].imag - t7;
+	t7 += a[2].imag;
+	a[2].imag = t7;  // a2.im + a6.im
+
+	t8 = a[4].real;
+	t3 = a[0].real - t8;
+	t8 += a[0].real;
+	a[0].real = t8;  // a0.re + a4.re
+
+	a[4].real = t3 - t6;
+	t3 += t6;
+	a[6].real = t3;
+
+	t7 = a[5].real;  // odd-index inputs: a[1] with a[5], a[3] with a[7]
+	t3 = a[1].real - t7;
+	t7 += a[1].real;
+	a[1].real = t7;  // a1.re + a5.re
+
+	t8 = a[7].imag;
+	t6 = a[3].imag - t8;
+	t8 += a[3].imag;
+	a[3].imag = t8;  // a3.im + a7.im
+	t1 = t3 - t6;
+	t3 += t6;
+
+	t7 = a[5].imag;
+	t4 = a[1].imag - t7;
+	t7 += a[1].imag;
+	a[1].imag = t7;  // a1.im + a5.im
+
+	t8 = a[7].real;
+	t5 = a[3].real - t8;
+	t8 += a[3].real;
+	a[3].real = t8;  // a3.re + a7.re
+
+	t2 = t4 - t5;
+	t4 += t5;
+
+	t6 = t1 - t4;
+	t8 = kSqrtHalf;  // t8 holds the sqrt(1/2) factor for the rest of this stage
+	t6 *= t8;
+	a[5].real = a[4].real - t6;
+	t1 += t4;
+	t1 *= t8;
+	a[5].imag = a[4].imag - t1;
+	t6 += a[4].real;
+	a[4].real = t6;
+	t1 += a[4].imag;
+	a[4].imag = t1;
+
+	t5 = t2 - t3;
+	t5 *= t8;
+	a[7].imag = a[6].imag - t5;
+	t2 += t3;
+	t2 *= t8;
+	a[7].real = a[6].real - t2;
+	t2 += a[6].real;
+	a[6].real = t2;
+	t5 += a[6].imag;
+	a[6].imag = t5;
+
+	FFT4Opt(a);  // 4-point FFT over a[0..3], which now hold the pairwise sums
+
+	// Reorder to the correct output order.
+	// TODO: Modify the above computation so that this is not needed.
+	ComplexOpt tmp = a[2];
+	a[2] = a[3];
+	a[3] = a[5];
+	a[5] = a[7];
+	a[7] = a[4];
+	a[4] = a[1];
+	a[1] = a[6];
+	a[6] = tmp;
+}
+
+// Same as FFT8, but all inputs are real; reads in[0..7], writes out[0..7].
+// TODO: Since this does not need to be in-place, maybe there is a
+// faster FFT than this one, which is derived from DJB's in-place complex FFT.
+void RealFFT8Opt(const float* in, ComplexOpt* out) {
+	float t1, t2, t3, t5, t6, t7, t8;
+	t8 = in[6];
+	t5 = in[2] - t8;  // t5 = in2 - in6
+	t8 += in[2];
+	out[2].real = t8;  // in2 + in6
+	out[6].imag = -t5;
+	out[4].imag = t5;
+	t8 = in[4];
+	t3 = in[0] - t8;  // t3 = in0 - in4
+	t8 += in[0];
+	out[0].real = t8;  // in0 + in4
+	out[4].real = t3;
+	out[6].real = t3;
+	t7 = in[5];
+	t3 = in[1] - t7;  // t3 = in1 - in5
+	t7 += in[1];
+	out[1].real = t7;  // in1 + in5
+	t8 = in[7];
+	t5 = in[3] - t8;  // t5 = in3 - in7
+	t8 += in[3];
+	out[3].real = t8;  // in3 + in7
+	t2 = -t5;
+	t6 = t3 - t5;
+	t8 = kSqrtHalf;  // t8 holds the sqrt(1/2) factor for this stage
+	t6 *= t8;
+	out[5].real = out[4].real - t6;
+	t1 = t3 + t5;
+	t1 *= t8;
+	out[5].imag = out[4].imag - t1;
+	t6 += out[4].real;
+	out[4].real = t6;
+	t1 += out[4].imag;
+	out[4].imag = t1;
+	t5 = t2 - t3;
+	t5 *= t8;
+	out[7].imag = out[6].imag - t5;
+	t2 += t3;
+	t2 *= t8;
+	out[7].real = out[6].real - t2;
+	t2 += out[6].real;
+	out[6].real = t2;
+	t5 += out[6].imag;
+	out[6].imag = t5;
+	t5 = out[2].real;  // 4-point stage over the pairwise sums in out[0..3].real
+	t1 = out[0].real - t5;
+	t7 = out[3].real;
+	t5 += out[0].real;
+	t3 = out[1].real - t7;
+	t7 += out[1].real;
+	t8 = t5 + t7;
+	out[0].real = t8;  // sum of all eight inputs
+	t5 -= t7;
+	out[1].real = t5;
+	out[2].imag = t3;
+	out[3].imag = -t3;
+	out[3].real = t1;
+	out[2].real = t1;
+	out[0].imag = 0;
+	out[1].imag = 0;
+
+	// Reorder to the correct output order.
+	// TODO: Modify the above computation so that this is not needed.
+	ComplexOpt tmp = out[2];
+	out[2] = out[3];
+	out[3] = out[5];
+	out[5] = out[7];
+	out[7] = out[4];
+	out[4] = out[1];
+	out[1] = out[6];
+	out[6] = tmp;
+}
+
+// Writes the scaled squared FFT magnitudes of the block into
+// block[kBlockEdgeHalf..(kBlockHalf+kBlockEdgeHalf)]; the rest is untouched.
+void ButteraugliFFTSquaredOpt(float block[kBlockSize]) {
+	assert(kBlockEdge == 8);
+	const float kPowerScale = 0.000064;
+	ComplexOpt freq[kBlockSize];
+	// Pass 1: real-input FFT along each row of the block.
+	for (int row = 0; row < kBlockEdge; ++row) {
+		RealFFT8Opt(block + row * kBlockEdge, freq + row * kBlockEdge);
+	}
+	TransposeBlock(freq);
+	// Pass 2: the rows at offsets 0 and kBlockHalf go through the real-input
+	// FFT using only their real parts; rows 1..kBlockEdgeHalf-1 get the full one.
+	float row_lo[kBlockEdge], row_mid[kBlockEdge];
+	for (int col = 0; col < kBlockEdge; ++col) {
+		row_lo[col] = freq[col].real;
+		row_mid[col] = freq[kBlockHalf + col].real;
+	}
+	RealFFT8Opt(row_lo, freq);
+	RealFFT8Opt(row_mid, freq + kBlockHalf);
+	for (int row = 1; row < kBlockEdgeHalf; ++row)
+		FFT8OptOpt(freq + row * kBlockEdge);
+	for (int i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) {
+		block[i] = abssq(freq[i]) * kPowerScale;
+	}
+}
+
+// Computes 8x8 FFT of each channel of xyb0 and xyb1 and adds the total squared
+// 3-dimensional xybdiff of the two blocks to diff_xyb_{dc,ac} and the average
+// diff on the edges to diff_xyb_edge_dc. NOTE: xyb0 and xyb1 are overwritten.
+void ButteraugliBlockDiffOpt(float xyb0[3 * kBlockSize],
+	float xyb1[3 * kBlockSize],
+	float diff_xyb_dc[3],
+	float diff_xyb_ac[3],
+	float diff_xyb_edge_dc[3]) {
+	PROFILER_FUNC;
+	const float *csf8x8 = GetContrastSensitivityMatrixOpt();
+
+	float avgdiff_xyb[3] = { 0.0 };  // per-channel mean difference over the block
+	float avgdiff_edge[3][4] = { { 0.0 } };  // per-channel mean difference on each of the 4 edges
+	for (int i = 0; i < 3 * kBlockSize; ++i) {
+		const float diff_xyb = xyb0[i] - xyb1[i];
+		const int c = i / kBlockSize;  // channel index
+		avgdiff_xyb[c] += diff_xyb / kBlockSize;
+		const int k = i % kBlockSize;  // position within the channel's block
+		const int kx = k % kBlockEdge;
+		const int ky = k / kBlockEdge;
+		const int h_edge_idx = ky == 0 ? 1 : ky == 7 ? 3 : -1;  // slot 1: ky == 0, slot 3: ky == 7
+		const int v_edge_idx = kx == 0 ? 0 : kx == 7 ? 2 : -1;  // slot 0: kx == 0, slot 2: kx == 7
+		if (h_edge_idx >= 0) {
+			avgdiff_edge[c][h_edge_idx] += diff_xyb / kBlockEdge;
+		}
+		if (v_edge_idx >= 0) {
+			avgdiff_edge[c][v_edge_idx] += diff_xyb / kBlockEdge;
+		}
+	}
+	XybDiffLowFreqSquaredAccumulateOpt(avgdiff_xyb[0],
+		avgdiff_xyb[1],
+		avgdiff_xyb[2],
+		0, 0, 0, csf8x8[0],
+		diff_xyb_dc);
+	for (int i = 0; i < 4; ++i) {
+		XybDiffLowFreqSquaredAccumulateOpt(avgdiff_edge[0][i],
+			avgdiff_edge[1][i],
+			avgdiff_edge[2][i],
+			0, 0, 0, csf8x8[0],
+			diff_xyb_edge_dc);
+	}
+
+	float* xyb_avg = xyb0;  // from here on xyb0 is reused to hold the average
+	float* xyb_halfdiff = xyb1;  // and xyb1 to hold half the difference
+	for (int i = 0; i < 3 * kBlockSize; ++i) {
+		float avg = (xyb0[i] + xyb1[i]) / 2;
+		float halfdiff = (xyb0[i] - xyb1[i]) / 2;
+		xyb_avg[i] = avg;
+		xyb_halfdiff[i] = halfdiff;
+	}
+	float *y_avg = &xyb_avg[kBlockSize];
+	float *x_halfdiff_squared = &xyb_halfdiff[0];
+	float *y_halfdiff = &xyb_halfdiff[kBlockSize];
+	float *z_halfdiff_squared = &xyb_halfdiff[2 * kBlockSize];
+	ButteraugliFFTSquaredOpt(y_avg);  // each call replaces part of its block with squared FFT magnitudes
+	ButteraugliFFTSquaredOpt(x_halfdiff_squared);
+	ButteraugliFFTSquaredOpt(y_halfdiff);
+	ButteraugliFFTSquaredOpt(z_halfdiff_squared);
+
+	static const float xmul = 64.8;
+	static const float ymul = 1.753123908348329;
+	static const float ymul2 = 1.51983458269;
+	static const float zmul = 2.4;
+
+	for (size_t i = kBlockEdgeHalf; i < kBlockHalf + kBlockEdgeHalf + 1; ++i) {  // same index range ButteraugliFFTSquaredOpt fills
+		float d = csf8x8[i];
+		diff_xyb_ac[0] += d * xmul * x_halfdiff_squared[i];
+		diff_xyb_ac[2] += d * zmul * z_halfdiff_squared[i];
+
+		y_avg[i] = sqrt(y_avg[i]);  // back from squared magnitude to magnitude
+		y_halfdiff[i] = sqrt(y_halfdiff[i]);
+		float y0 = y_avg[i] - y_halfdiff[i];
+		float y1 = y_avg[i] + y_halfdiff[i];
+		// Remove the impact of small absolute values.
+		// This improves the behavior with flat noise.
+		static const float ylimit = 0.04;
+		y0 = RemoveRangeAroundZeroOpt(y0, ylimit);
+		y1 = RemoveRangeAroundZeroOpt(y1, ylimit);
+		if (y0 != y1) {  // equal values would contribute exactly zero
+			float valy0 = InterpolateOpt(GetHighFreqColorDiffDyOpt(), 21, y0 * ymul2);
+			float valy1 = InterpolateOpt(GetHighFreqColorDiffDyOpt(), 21, y1 * ymul2);
+			float valy = ymul * (valy0 - valy1);
+			diff_xyb_ac[1] += d * valy * valy;
+		}
+	}
+}
+
+// Low frequency edge detectors.
+// Two edge detectors are applied in each corner of the 8x8 square at
+// (pos_x, pos_y); detectors reaching outside the xsize x ysize image are
+// skipped. The averaged squared 3-dimensional error is added to diff_xyb.
+void Butteraugli8x8CornerEdgeDetectorDiffOpt(
+	const size_t pos_x, const size_t pos_y,
+	const size_t xsize, const size_t ysize,
+	const std::vector<std::vector<float> > &blurred0,
+	const std::vector<std::vector<float> > &blurred1,
+	float diff_xyb[3]) {
+	PROFILER_FUNC;
+	int local_count = 0;
+	float local_xyb[3] = { 0 };
+	static const float w = 0.711100840192;
+	for (int k = 0; k < 4; ++k) {
+		size_t step = 3;
+		size_t offset[4][2] = { { 0, 0 },{ 0, 7 },{ 7, 0 },{ 7, 7 } };
+		size_t x = pos_x + offset[k][0];
+		size_t y = pos_y + offset[k][1];
+		if (x >= step && x + step < xsize) {  // horizontal detector fits
+			size_t ix = y * xsize + (x - step);
+			size_t ix2 = ix + 2 * step;
+			XybDiffLowFreqSquaredAccumulateOpt(
+				w * (blurred0[0][ix] - blurred0[0][ix2]),
+				w * (blurred0[1][ix] - blurred0[1][ix2]),
+				w * (blurred0[2][ix] - blurred0[2][ix2]),
+				w * (blurred1[0][ix] - blurred1[0][ix2]),
+				w * (blurred1[1][ix] - blurred1[1][ix2]),
+				w * (blurred1[2][ix] - blurred1[2][ix2]),
+				1.0, local_xyb);
+			++local_count;
+		}
+		if (y >= step && y + step < ysize) {  // vertical detector fits
+			size_t ix = (y - step) * xsize + x;
+			size_t ix2 = ix + 2 * step * xsize;
+			XybDiffLowFreqSquaredAccumulateOpt(
+				w * (blurred0[0][ix] - blurred0[0][ix2]),
+				w * (blurred0[1][ix] - blurred0[1][ix2]),
+				w * (blurred0[2][ix] - blurred0[2][ix2]),
+				w * (blurred1[0][ix] - blurred1[0][ix2]),
+				w * (blurred1[1][ix] - blurred1[1][ix2]),
+				w * (blurred1[2][ix] - blurred1[2][ix2]),
+				1.0, local_xyb);
+			++local_count;
+		}
+	}
+	// No detector fitted (both image dimensions too small): contribute nothing.
+	// Dividing by a zero count would give inf, and inf * 0 adds NaN to diff_xyb.
+	if (local_count == 0) return;
+	static const float weight = 0.01617112696;
+	const float mul = weight * 8.0 / local_count;
+	for (int i = 0; i < 3; ++i) diff_xyb[i] += mul * local_xyb[i];
+}
+
+// https://en.wikipedia.org/wiki/Photopsin absorbance modeling.
+const float *GetOpsinAbsorbanceOpt() {  // returns a 12-float table, read by OpsinAbsorbanceOpt as 3 rows of 4
+	static const float kMix[12] = {
+		0.348036746003,  // row 0: multiplies in[0]
+		0.577814843137,  // row 0: multiplies in[1]
+		0.0544556093735,  // row 0: multiplies in[2]
+		0.774145581713,  // row 0: constant term
+		0.26922717275,  // row 1: multiplies in[0]
+		0.767247733938,  // row 1: multiplies in[1]
+		0.0366922708552,  // row 1: multiplies in[2]
+		0.920130265014,  // row 1: constant term
+		0.0882062883536,  // row 2: multiplies in[0]
+		0.158581714673,  // row 2: multiplies in[1]
+		0.712857943858,  // row 2: multiplies in[2]
+		10.6524069248,  // row 2: constant term
+	};
+	return &kMix[0];
+}
+
+// out[c] = row c of the 3x4 mix table applied to in[0..2], plus its constant.
+void OpsinAbsorbanceOpt(const float in[3], float out[3]) {
+	const float *row = GetOpsinAbsorbanceOpt();
+	for (int c = 0; c < 3; ++c, row += 4)
+		out[c] = row[0] * in[0] + row[1] * in[1] + row[2] * in[2] + row[3];
+}
+
+// Smallest OpsinAbsorbanceOpt output channel for an all-zero input.
+float GammaMinArgOpt() {
+	float black[3] = { 0.0, 0.0, 0.0 }, mixed[3];
+	OpsinAbsorbanceOpt(black, mixed);
+	return std::min(mixed[0], std::min(mixed[1], mixed[2]));
+}
+
+// Largest OpsinAbsorbanceOpt output channel for an all-255 input.
+float GammaMaxArgOpt() {
+	float white[3] = { 255.0, 255.0, 255.0 }, mixed[3];
+	OpsinAbsorbanceOpt(white, mixed);
+	return std::max(mixed[0], std::max(mixed[1], mixed[2]));
+}
+
+void MaskHighIntensityChangeOpt(  // reads c0/c1, writes the damped planes to xyb0/xyb1
+	size_t xsize, size_t ysize,
+	const std::vector<std::vector<float> > &c0,
+	const std::vector<std::vector<float> > &c1,
+	std::vector<std::vector<float> > &xyb0,
+	std::vector<std::vector<float> > &xyb1) {
+	PROFILER_FUNC;
+	for (size_t y = 0; y < ysize; ++y) {
+		for (size_t x = 0; x < xsize; ++x) {
+			size_t ix = y * xsize + x;
+			const float ave[3] = {  // per-channel mean of the two images at this pixel
+				(c0[0][ix] + c1[0][ix]) * 0.5,
+				(c0[1][ix] + c1[1][ix]) * 0.5,
+				(c0[2][ix] + c1[2][ix]) * 0.5,
+			};
+			float sqr_max_diff = -1;  // stays -1 only if all four neighbours are skipped
+			{
+				int offset[4] =
+				{ -1, 1, -static_cast<int>(xsize), static_cast<int>(xsize) };
+				int border[4] =
+				{ x == 0, x + 1 == xsize, y == 0, y + 1 == ysize };
+				for (int dir = 0; dir < 4; ++dir) {
+					if (border[dir]) {  // neighbour in this direction is outside the image
+						continue;
+					}
+					const int ix2 = ix + offset[dir];
+					float diff = 0.5 * (c0[1][ix2] + c1[1][ix2]) - ave[1];  // channel 1 only
+					diff *= diff;
+					if (sqr_max_diff < diff) {
+						sqr_max_diff = diff;
+					}
+				}
+			}
+			static const float kReductionX = 275.19165240059317;
+			static const float kReductionY = 18599.41286306991;
+			static const float kReductionZ = 410.8995306951065;
+			static const float kChromaBalance = 106.95800948271017;
+			float chroma_scale = kChromaBalance / (ave[1] + kChromaBalance);
+
+			const float mix[3] = {  // weight of the actual colour; shrinks as sqr_max_diff grows
+				chroma_scale * kReductionX / (sqr_max_diff + kReductionX),
+				kReductionY / (sqr_max_diff + kReductionY),
+				chroma_scale * kReductionZ / (sqr_max_diff + kReductionZ),
+			};
+			// Interpolate linearly between the average color and the actual
+			// color -- to reduce the importance of this pixel.
+			for (int i = 0; i < 3; ++i) {
+				xyb0[i][ix] = static_cast<float>(mix[i] * c0[i][ix] + (1 - mix[i]) * ave[i]);
+				xyb1[i][ix] = static_cast<float>(mix[i] * c1[i][ix] + (1 - mix[i]) * ave[i]);
+			}
+		}
+	}
+}
+
+// Two-knee compressive curve followed by a power law: input above each knee
+// is reduced by a fixed fraction of the excess, then the result is raised to
+// kGamma, offset and scaled.
+float SimpleGammaOpt(float v) {
+	static const float kGamma = 0.387494322593;
+	static const float kKnee1 = 43.01745241042018, kSlope1 = 0.0383723643799;
+	static const float kKnee2 = 94.68634353321337, kSlope2 = 0.22885405968;
+	static const float kOffset = 0.156775786057, kScale = 8.898059160493739;
+	const float over1 = v - kKnee1;
+	if (over1 >= 0) {
+		v -= over1 * kSlope1;
+	}
+	// The second knee is tested against the already-reduced value.
+	const float over2 = v - kKnee2;
+	if (over2 >= 0) {
+		v -= over2 * kSlope2;
+	}
+	return kScale * (kOffset + pow(v, kGamma));
+}
+
+// Polynomial evaluation via Clenshaw's scheme (similar to Horner's).
+// The template unrolls the recursion at compile time; it lives outside any
+// class because it needs an explicit specialization for the base case.
+template <int INDEX>
+static inline void ClenshawRecursionOpt(const float x, const float *coefficients,
+	float *b1, float *b2) {
+	const float prev = *b1, scaled = x * prev;
+	// One step of b_k = 2 * x * b_{k+1} - b_{k+2} + c_k, highest index first.
+	const float next = (scaled + scaled) - (*b2) + coefficients[INDEX];
+	*b2 = prev;
+	*b1 = next;
+	ClenshawRecursionOpt<INDEX - 1>(x, coefficients, b1, b2);
+}
+
+// Base case (INDEX == 0): ends the recursion and leaves the result in *b1.
+template <>
+inline void ClenshawRecursionOpt<0>(const float x, const float *coefficients,
+	float *b1, float *b2) {
+	// The final iteration differs - x * b1 is used once, not doubled.
+	const float scaled = x * (*b1);
+	*b1 = scaled - (*b2) + coefficients[0];
+}
+
+// Rational polynomial := dividing two polynomial evaluations. These are easier
+// to find than minimax polynomials.
+struct RationalPolynomialOpt {
+	// Evaluates the N-coefficient Chebyshev series at x via
+	// ClenshawRecursionOpt, starting from zeroed accumulators.
+	template <int N>
+	static float EvaluatePolynomial(const float x,
+		const float(&coefficients)[N]) {
+		float acc = 0.0, carry = 0.0;
+		ClenshawRecursionOpt<N - 1>(x, coefficients, &acc, &carry);
+		return acc;
+	}
+
+	// Evaluates p/q at x (in [min_value, max_value]); yields 0 when the
+	// denominator evaluates to exactly 0.
+	inline float operator()(const float x) const {
+		// Map x to [0, 1] and then to the [-1, 1] domain of Chebyshev
+		// polynomials.
+		const float unit = (x - min_value) / (max_value - min_value);
+		const float cheb = 2.0 * unit - 1.0;
+		const float numerator = EvaluatePolynomial(cheb, p);
+		const float denominator = EvaluatePolynomial(cheb, q);
+		return (denominator == 0.0) ? 0.0f : numerator / denominator;
+	}
+
+	// Domain of the polynomials; they are undefined elsewhere.
+	float min_value;
+	float max_value;
+
+	// Coefficients of T_n (Chebyshev polynomials of the first kind).
+	// Degree 5/5 is a compromise between accuracy (0.1%) and numerical stability.
+	float p[5 + 1];
+	float q[5 + 1];
+};
+
+// Gamma curve evaluated through a fixed degree 5/5 rational polynomial whose
+// domain is [0.77, 274.58].
+static inline float GammaPolynomialOpt(float value) {
+	// Generated by gamma_polynomial.m from equispaced x/gamma(x) samples.
+	static const RationalPolynomialOpt kFit = {
+		0.770000000000000, 274.579999999999984,  // min_value, max_value
+		{  // p: numerator coefficients
+			881.979476556478289, 1496.058452015812463, 908.662212739659481,
+			373.566100223287378, 85.840860336314364, 6.683258861509244 },
+		{  // q: denominator coefficients
+			12.262350348616792, 20.557285797683576, 12.161463238367844,
+			4.711532733641639, 0.899112889751053, 0.035662329617191 } };
+	return kFit(value);
+}
+
+static inline float GammaOpt(float v) {
+	// Uses the rational-polynomial fit; SimpleGamma(v) is the alternative.
+	return GammaPolynomialOpt(v);
+}
+
+// Converts the three planes of rgb to XYB in place. The per-pixel sensitivity
+// comes from a blurred copy of the image (planes assumed xsize * ysize long).
+void OpsinDynamicsImageOpt(size_t xsize, size_t ysize,
+	std::vector<std::vector<float> > &rgb) {
+	PROFILER_FUNC;
+	static const float kSigma = 1.1;
+	std::vector<std::vector<float> > blurred = rgb;
+	for (int c = 0; c < 3; ++c) {
+		BlurOpt(xsize, ysize, blurred[c].data(), kSigma, 0.0);
+	}
+	const size_t num_pixels = rgb[0].size();
+	for (size_t i = 0; i < num_pixels; ++i) {
+		// Sensitivity is Gamma(m) / m for each absorbance channel m of the
+		// blurred pixel.
+		const float smooth_rgb[3] = { blurred[0][i], blurred[1][i], blurred[2][i] };
+		float smooth_mixed[3];
+		OpsinAbsorbanceOpt(smooth_rgb, smooth_mixed);
+		// Apply that sensitivity to the absorbance of the unblurred pixel.
+		const float sharp_rgb[3] = { rgb[0][i], rgb[1][i], rgb[2][i] };
+		float sharp_mixed[3];
+		OpsinAbsorbanceOpt(sharp_rgb, sharp_mixed);
+		for (int c = 0; c < 3; ++c) {
+			const float sensitivity = GammaOpt(smooth_mixed[c]) / smooth_mixed[c];
+			sharp_mixed[c] *= sensitivity;
+		}
+		float x, y, z;
+		RgbToXybOpt(sharp_mixed[0], sharp_mixed[1], sharp_mixed[2], &x, &y, &z);
+		rgb[0][i] = x;
+		rgb[1][i] = y;
+		rgb[2][i] = z;
+	}
+}
+
+// Multiplies every element of *result by scale, in place.
+void ScaleImageOpt(float scale, std::vector<float> *result) {
+	PROFILER_FUNC;
+	std::vector<float> &values = *result;
+	for (size_t i = 0; i < values.size(); ++i) values[i] *= scale;
+}
+
+// Making a cluster of local errors to be more impactful than
+// just a single error.
+// On entry *diffmap holds one value per step x step cell; on return it holds
+// xsize * ysize values (upsampled, square-rooted, blended with a blur).
+void CalculateDiffmapOpt(const size_t xsize, const size_t ysize,
+	const size_t step,
+	std::vector<float>* diffmap) {
+	PROFILER_FUNC;
+	// Shift the diffmap more correctly above the pixels, from 2.5 pixels to 0.5
+	// pixels distance over the original image. The border of 2 pixels on top and
+	// left side and 3 pixels on right and bottom side are zeroed, but these
+	// values have no meaning, they only exist to keep the result map the same
+	// size as the input images.
+	int s2 = (8 - step) / 2;
+	{
+		// Upsample and take square root.
+		std::vector<float> diffmap_out(xsize * ysize);
+		const size_t res_xsize = (xsize + step - 1) / step;
+		for (size_t res_y = 0; res_y + 8 - step < ysize; res_y += step) {
+			for (size_t res_x = 0; res_x + 8 - step < xsize; res_x += step) {
+				size_t res_ix = (res_y * res_xsize + res_x) / step;
+				float orig_val = (*diffmap)[res_ix];
+				constexpr float kInitialSlope = 100;
+				// TODO(b/29974893): Until that is fixed do not call sqrt on very small
+				// numbers.
+				float val = orig_val < (1.0 / (kInitialSlope * kInitialSlope))
+					? kInitialSlope * orig_val
+					: std::sqrt(orig_val);
+				for (size_t off_y = 0; off_y < step; ++off_y) {
+					for (size_t off_x = 0; off_x < step; ++off_x) {
+						diffmap_out[(res_y + off_y + s2) * xsize + res_x + off_x + s2] = val;
+					}
+				}
+			}
+		}
+		diffmap->swap(diffmap_out);  // hand over the buffer instead of deep-copying it
+	}
+	{
+		static const float kSigma = 8.8510880283;
+		static const float mul1 = 24.8235314874;
+		static const float scale = 1.0 / (1.0 + mul1);
+		const int s = 8 - step;
+		std::vector<float> blurred((xsize - s) * (ysize - s));
+		for (size_t y = 0; y < ysize - s; ++y) {
+			for (size_t x = 0; x < xsize - s; ++x) {
+				blurred[y * (xsize - s) + x] = (*diffmap)[(y + s2) * xsize + x + s2];
+			}
+		}
+		static const float border_ratio = 0.03027655136;
+		BlurOpt(xsize - s, ysize - s, blurred.data(), kSigma, border_ratio);
+		for (size_t y = 0; y < ysize - s; ++y) {
+			for (size_t x = 0; x < xsize - s; ++x) {
+				(*diffmap)[(y + s2) * xsize + x + s2] += static_cast<float>(mul1) * blurred[y * (xsize - s) + x];
+			}
+		}
+		ScaleImageOpt(scale, diffmap);  // scale = 1 / (1 + mul1) normalises the blend
+	}
+}
+
+// Builds a 512-entry table: entry i is (1 + extmul * (c + extoff))^2 with
+// c = mul / (0.01 * scaler * i + offset).
+static std::array<float, 512> MakeMaskOpt(
+	float extmul, float extoff, float mul, float offset, float scaler) {
+	std::array<float, 512> table;
+	for (size_t i = 0; i < table.size(); ++i) {
+		const float c = mul / ((0.01 * scaler * i) + offset);
+		const float base = 1.0 + extmul * (c + extoff);
+		assert(base >= 0.0);
+		table[i] = base * base;
+	}
+	return table;
+}
+
+// X-channel mask: looks delta up in a MakeMaskOpt table built on first use.
+float MaskXOpt(float delta) {
+	PROFILER_FUNC;
+	static const float extmul = 0.975741017749, extoff = -4.25328244168;
+	static const float mul = 20.8029176447, offset = 0.454909521427;
+	static const float scaler = 0.0738288224836;
+	static const std::array<float, 512> lut =
+		MakeMaskOpt(extmul, extoff, mul, offset, scaler);
+	// Negative deltas clamp to the first entry, large ones to the last.
+	return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta);
+}
+
+// Y-channel mask: looks delta up in a MakeMaskOpt table built on first use.
+float MaskYOpt(float delta) {
+	PROFILER_FUNC;
+	static const float extmul = 0.373995618954, extoff = 1.5307267433;
+	static const float mul = 16.2447033988, offset = 0.911952641929;
+	static const float scaler = 1.1731667845;
+	static const std::array<float, 512> lut =
+		MakeMaskOpt(extmul, extoff, mul, offset, scaler);
+	// Negative deltas clamp to the first entry, large ones to the last.
+	return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta);
+}
+
+// B-channel mask: looks delta up in a MakeMaskOpt table built on first use.
+float MaskBOpt(float delta) {
+	PROFILER_FUNC;
+	static const float extmul = 0.61582234137, extoff = -4.25376118646;
+	static const float mul = 31.1444967089, offset = 1.05105070921;
+	static const float scaler = 0.47434643535;
+	static const std::array<float, 512> lut =
+		MakeMaskOpt(extmul, extoff, mul, offset, scaler);
+	// Negative deltas clamp to the first entry, large ones to the last.
+	return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta);
+}
+
+// DC mask for the X channel: same table lookup with its own constants.
+float MaskDcXOpt(float delta) {
+	PROFILER_FUNC;
+	static const float extmul = 1.79116943438, extoff = -3.86797479189;
+	static const float mul = 20.4563479139, offset = 0.670960225853;
+	static const float scaler = 0.486575865525;
+	static const std::array<float, 512> lut =
+		MakeMaskOpt(extmul, extoff, mul, offset, scaler);
+	// Negative deltas clamp to the first entry, large ones to the last.
+	return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta);
+}
+
+// DC mask for the Y channel: same table lookup with its own constants.
+float MaskDcYOpt(float delta) {
+	PROFILER_FUNC;
+	static const float extmul = 0.212223514236, extoff = -3.65647120524;
+	static const float mul = 21.6566724788, offset = 1.73396799447;
+	static const float scaler = 0.170392660501;
+	static const std::array<float, 512> lut =
+		MakeMaskOpt(extmul, extoff, mul, offset, scaler);
+	// Negative deltas clamp to the first entry, large ones to the last.
+	return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta);
+}
+
+// DC mask for the B channel: same table lookup with its own constants.
+float MaskDcBOpt(float delta) {
+	PROFILER_FUNC;
+	static const float extmul = 0.349376011816, extoff = -0.894711072781;
+	static const float mul = 18.0373825149, offset = 0.901647926679;
+	static const float scaler = 0.380086095024;
+	static const std::array<float, 512> lut =
+		MakeMaskOpt(extmul, extoff, mul, offset, scaler);
+	// Negative deltas clamp to the first entry, large ones to the last.
+	return InterpolateClampNegativeOpt(lut.data(), lut.size(), delta);
+}
+
+// Replaces values[x + y * xsize] with the minimum of the values in the
+// square_size square with coordinates
+//   x - offset .. x + square_size - offset - 1,
+//   y - offset .. y + square_size - offset - 1 (clipped to the image).
+void MinSquareValOpt(size_t square_size, size_t offset,
+	size_t xsize, size_t ysize,
+	float *values) {
+	PROFILER_FUNC;
+	// offset is not negative and smaller than square_size.
+	assert(offset < square_size);
+	std::vector<float> column_min(xsize * ysize);
+	// Pass 1: minimum over the vertical extent of the window.
+	for (size_t y = 0; y < ysize; ++y) {
+		const size_t first = offset > y ? 0 : y - offset;
+		const size_t end = std::min<size_t>(ysize, y + square_size - offset);
+		for (size_t x = 0; x < xsize; ++x) {
+			float lowest = values[x + first * xsize];
+			for (size_t row = first + 1; row < end; ++row) {
+				lowest = std::min(lowest, values[x + row * xsize]);
+			}
+			column_min[x + y * xsize] = lowest;
+		}
+	}
+	// Pass 2: minimum of those column minima over the horizontal extent.
+	for (size_t x = 0; x < xsize; ++x) {
+		const size_t first = offset > x ? 0 : x - offset;
+		const size_t end = std::min<size_t>(xsize, x + square_size - offset);
+		for (size_t y = 0; y < ysize; ++y) {
+			float lowest = column_min[first + y * xsize];
+			for (size_t col = first + 1; col < end; ++col) {
+				lowest = std::min(lowest, column_min[col + y * xsize]);
+			}
+			values[x + y * xsize] = lowest;
+		}
+	}
+}
+
+// Weighted 3x3 smoothing of *diffs in place (despite the name): self and the
+// four edge neighbours weigh 1, diagonals weigh w, sum scaled by 1/(5 + 4w).
+void Average5x5Opt(int xsize, int ysize, std::vector<float>* diffs) {
+	PROFILER_FUNC;
+	// TODO: Make this work for small dimensions as well.
+	if (xsize < 4 || ysize < 4) return;
+	static const float w = 0.679144890667f;
+	static const float scale = 1.0f / (5.0f + 4 * w);
+	std::vector<float> result = *diffs;  // starts with each pixel's own value
+	std::vector<float> tmp0 = *diffs;  // unweighted source values
+	std::vector<float> tmp1 = *diffs;  // source values pre-multiplied by w
+	ScaleImageOpt(w, &tmp1);
+	for (int y = 0; y < ysize; y++) {
+		const int row0 = y * xsize;
+		result[row0 + 1] += tmp0[row0];  // horizontal neighbours within row y
+		result[row0 + 0] += tmp0[row0 + 1];
+		result[row0 + 2] += tmp0[row0 + 1];
+		for (int x = 2; x < xsize - 2; ++x) {
+			result[row0 + x - 1] += tmp0[row0 + x];
+			result[row0 + x + 1] += tmp0[row0 + x];
+		}
+		result[row0 + xsize - 3] += tmp0[row0 + xsize - 2];
+		result[row0 + xsize - 1] += tmp0[row0 + xsize - 2];
+		result[row0 + xsize - 2] += tmp0[row0 + xsize - 1];
+		if (y > 0) {  // spread row y into row y - 1
+			const int rowd1 = row0 - xsize;
+			result[rowd1 + 1] += tmp1[row0];
+			result[rowd1 + 0] += tmp0[row0];
+			for (int x = 1; x < xsize - 1; ++x) {
+				result[rowd1 + x + 1] += tmp1[row0 + x];
+				result[rowd1 + x + 0] += tmp0[row0 + x];
+				result[rowd1 + x - 1] += tmp1[row0 + x];
+			}
+			result[rowd1 + xsize - 1] += tmp0[row0 + xsize - 1];
+			result[rowd1 + xsize - 2] += tmp1[row0 + xsize - 1];
+		}
+		if (y + 1 < ysize) {  // spread row y into row y + 1
+			const int rowu1 = row0 + xsize;
+			result[rowu1 + 1] += tmp1[row0];
+			result[rowu1 + 0] += tmp0[row0];
+			for (int x = 1; x < xsize - 1; ++x) {
+				result[rowu1 + x + 1] += tmp1[row0 + x];
+				result[rowu1 + x + 0] += tmp0[row0 + x];
+				result[rowu1 + x - 1] += tmp1[row0 + x];
+			}
+			result[rowu1 + xsize - 1] += tmp0[row0 + xsize - 1];
+			result[rowu1 + xsize - 2] += tmp1[row0 + xsize - 1];
+		}
+	}
+	diffs->swap(result);  // hand over the buffer instead of deep-copying it
+	ScaleImageOpt(scale, diffs);
+}
+
+// For every pixel and channel, stores in (*mask)[c] the smaller of the two
+// images' local activity, where activity is |horizontal response| +
+// |vertical response| of XybToValsOpt applied to neighbour differences.
+void DiffPrecomputeOpt(
+	const std::vector<std::vector<float> > &xyb0,
+	const std::vector<std::vector<float> > &xyb1,
+	size_t xsize, size_t ysize,
+	std::vector<std::vector<float> > *mask) {
+	PROFILER_FUNC;
+	mask->resize(3, std::vector<float>(xyb0[0].size()));
+	float horiz0[3] = { 0.0 };
+	float vert0[3] = { 0.0 };
+	float horiz1[3] = { 0.0 };
+	float vert1[3] = { 0.0 };
+	int neighbour;
+	for (size_t y = 0; y < ysize; ++y) {
+		for (size_t x = 0; x < xsize; ++x) {
+			const size_t ix = x + xsize * y;
+			// Horizontal step: compare against the pixel to the right, or
+			// against the pixel to the left when this is the last column.
+			// The first image is handled first, then the second.
+			neighbour = (x + 1 < xsize) ? ix + 1 : ix - 1;
+			{
+				const float dx0 = xyb0[0][ix] - xyb0[0][neighbour];
+				const float dy0 = xyb0[1][ix] - xyb0[1][neighbour];
+				const float dz0 = xyb0[2][ix] - xyb0[2][neighbour];
+				XybToValsOpt(dx0, dy0, dz0, &horiz0[0], &horiz0[1], &horiz0[2]);
+				const float dx1 = xyb1[0][ix] - xyb1[0][neighbour];
+				const float dy1 = xyb1[1][ix] - xyb1[1][neighbour];
+				const float dz1 = xyb1[2][ix] - xyb1[2][neighbour];
+				XybToValsOpt(dx1, dy1, dz1, &horiz1[0], &horiz1[1], &horiz1[2]);
+			}
+			// Vertical step: compare against the pixel one row further on, or
+			// against the pixel one row back when this is the last row.
+			// The first image is handled first, then the second.
+			neighbour = (y + 1 < ysize) ? ix + xsize : ix - xsize;
+			{
+				const float dx0 = xyb0[0][ix] - xyb0[0][neighbour];
+				const float dy0 = xyb0[1][ix] - xyb0[1][neighbour];
+				const float dz0 = xyb0[2][ix] - xyb0[2][neighbour];
+				XybToValsOpt(dx0, dy0, dz0, &vert0[0], &vert0[1], &vert0[2]);
+				const float dx1 = xyb1[0][ix] - xyb1[0][neighbour];
+				const float dy1 = xyb1[1][ix] - xyb1[1][neighbour];
+				const float dz1 = xyb1[2][ix] - xyb1[2][neighbour];
+				XybToValsOpt(dx1, dy1, dz1, &vert1[0], &vert1[1], &vert1[2]);
+			}
+			// Keep the quieter of the two images for each channel.
+			for (int c = 0; c < 3; ++c) {
+				const float activity0 = fabs(horiz0[c]) + fabs(vert0[c]);
+				const float activity1 = fabs(horiz1[c]) + fabs(vert1[c]);
+				const float quieter = std::min(activity0, activity1);
+				(*mask)[c][ix] = quieter;
+			}
+		}
+	}
+}
+
+// Builds the AC masks (*mask) and the DC masks (*mask_dc), three planes of
+// xsize * ysize floats each. The local activity from DiffPrecomputeOpt is
+// smoothed, min-filtered and blurred, then mapped through the per-channel
+// mask curves and scaled by kGlobalScale squared.
+void MaskOpt(const std::vector<std::vector<float> > &xyb0,
+	const std::vector<std::vector<float> > &xyb1,
+	size_t xsize, size_t ysize,
+	std::vector<std::vector<float> > *mask,
+	std::vector<std::vector<float> > *mask_dc) {
+	PROFILER_FUNC;
+	static const float kBlurSigma[3] = {
+		9.65781083553,
+		14.2644604355,
+		4.53358927369,
+	};
+	static const float kWeight[3] = { 232.206464018, 22.9455222245, 503.962310606 };
+	const size_t num_pixels = xsize * ysize;
+	mask->resize(3);
+	for (int c = 0; c < 3; ++c) {
+		(*mask)[c].resize(num_pixels);
+	}
+	DiffPrecomputeOpt(xyb0, xyb1, xsize, ysize, mask);
+	for (int c = 0; c < 3; ++c) {
+		// NOTE(review): calls _Average5x5, not Average5x5Opt - confirm intent.
+		_Average5x5(xsize, ysize, &(*mask)[c]);
+		MinSquareValOpt(4, 0, xsize, ysize, (*mask)[c].data());
+		BlurOpt(xsize, ysize, (*mask)[c].data(), kBlurSigma[c], 0.0);
+	}
+	mask_dc->resize(3);
+	for (int c = 0; c < 3; ++c) {
+		(*mask_dc)[c].resize(num_pixels);
+	}
+	// Map the weighted activity through the AC and DC curves of each channel.
+	for (size_t y = 0; y < ysize; ++y) {
+		for (size_t x = 0; x < xsize; ++x) {
+			const size_t idx = y * xsize + x;
+			const float p0 = kWeight[0] * (*mask)[0][idx];
+			const float p1 = kWeight[1] * (*mask)[1][idx];
+			const float p2 = kWeight[2] * (*mask)[2][idx];
+			(*mask)[0][idx] = MaskXOpt(p0);
+			(*mask)[1][idx] = MaskYOpt(p1);
+			(*mask)[2][idx] = MaskBOpt(p2);
+			(*mask_dc)[0][idx] = MaskDcXOpt(p0);
+			(*mask_dc)[1][idx] = MaskDcYOpt(p1);
+			(*mask_dc)[2][idx] = MaskDcBOpt(p2);
+		}
+	}
+	for (int c = 0; c < 3; ++c) {
+		ScaleImageOpt(kGlobalScale * kGlobalScale, &(*mask)[c]);
+		ScaleImageOpt(kGlobalScale * kGlobalScale, &(*mask_dc)[c]);
+	}
+}
+
+}
+
 namespace butteraugli
 {
     clButteraugliComparator::clButteraugliComparator(size_t xsize, size_t ysize, int step)
@@ -29,12 +1285,17 @@ namespace butteraugli
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_);
         }
 #endif
+		else if (MODE_CPU_OPT == g_mathMode)
+		{
+			DiffmapOpsinDynamicsImageOpt(xyb0, xyb1, result);
+		}
         else
         {
             ButteraugliComparator::DiffmapOpsinDynamicsImage(xyb0, xyb1, result);
         }
     }
 
+
     void clButteraugliComparator::BlockDiffMap(const std::vector<std::vector<float> > &xyb0,
         const std::vector<std::vector<float> > &xyb1,
         std::vector<float>* block_diff_dc,
@@ -50,8 +1311,7 @@ namespace butteraugli
                 (*block_diff_dc).data(), (*block_diff_ac).data());
         }
     }
-
-
+	
     void clButteraugliComparator::EdgeDetectorMap(const std::vector<std::vector<float> > &xyb0,
         const std::vector<std::vector<float> > &xyb1,
         std::vector<float>* edge_detector_map)
@@ -109,6 +1369,186 @@ namespace butteraugli
         }
     }
 
+	void clButteraugliComparator::DiffmapOpsinDynamicsImageOpt(
+		std::vector<std::vector<float>> &xyb0,
+		std::vector<std::vector<float>> &xyb1,
+		std::vector<float> &result)
+	{ // Drives the *Opt pipeline: masking, edge/block diff maps, channel combine, then CalculateDiffmapOpt on `result`.
+		if (xsize_ < 8 || ysize_ < 8) return; // too small in either dimension: nothing is computed and `result` is left untouched
+		{
+			auto xyb0_c = xyb0; // copies are passed in the c0/c1 positions while xyb0/xyb1 themselves are passed as the last two (non-const) arguments
+			auto xyb1_c = xyb1;
+			MaskHighIntensityChangeOpt(xsize_, ysize_, xyb0_c, xyb1_c, xyb0, xyb1);
+		}
+		assert(8 <= xsize_);
+		for (int i = 0; i < 3; i++) {
+			assert(xyb0[i].size() == num_pixels_);
+			assert(xyb1[i].size() == num_pixels_);
+		}
+		std::vector<float> edge_detector_map(3 * res_xsize_ * res_ysize_); // three floats per result cell (indexed as 3 * res_ix + i by the callee)
+		EdgeDetectorMapOpt(xyb0, xyb1, &edge_detector_map);
+		std::vector<float> block_diff_dc(3 * res_xsize_ * res_ysize_);
+		std::vector<float> block_diff_ac(3 * res_xsize_ * res_ysize_);
+		BlockDiffMapOpt(xyb0, xyb1, &block_diff_dc, &block_diff_ac);
+		EdgeDetectorLowFreqOpt(xyb0, xyb1, &block_diff_ac); // accumulates (+=) onto the AC values written by BlockDiffMapOpt
+		{
+			std::vector<std::vector<float> > mask_xyb(3);
+			std::vector<std::vector<float> > mask_xyb_dc(3);
+			MaskOpt(xyb0, xyb1, xsize_, ysize_, &mask_xyb, &mask_xyb_dc);
+			CombineChannelsOpt(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac,
+				edge_detector_map, &result); // resizes `result` to res_xsize_ * res_ysize_ and fills it
+		}
+		CalculateDiffmapOpt(xsize_, ysize_, step_, &result);
+	}
+
+	void clButteraugliComparator::BlockDiffMapOpt(const std::vector<std::vector<float> > &xyb0,
+		const std::vector<std::vector<float> > &xyb1,
+		std::vector<float>* block_diff_dc,
+		std::vector<float>* block_diff_ac)
+	{ // For each step_-spaced position, gathers a kBlockEdge x kBlockEdge block per channel from both images and stores the DC/AC diffs from ButteraugliBlockDiffOpt.
+		for (size_t res_y = 0; res_y + (kBlockEdge - step_ - 1) < ysize_;
+			res_y += step_) {
+			for (size_t res_x = 0; res_x + (kBlockEdge - step_ - 1) < xsize_;
+				res_x += step_) {
+				size_t res_ix = (res_y * res_xsize_ + res_x) / step_; // index of this cell in the subsampled result grid
+				size_t offset = (std::min(res_y, ysize_ - 8) * xsize_ +
+					std::min(res_x, xsize_ - 8)); // clamp the block origin to at most (xsize_ - 8, ysize_ - 8); presumably kBlockEdge == 8 - TODO confirm
+				float block0[3 * kBlockEdge * kBlockEdge]; // channel-major: channel i occupies [i * kBlockEdge * kBlockEdge, ...)
+				float block1[3 * kBlockEdge * kBlockEdge];
+				for (int i = 0; i < 3; ++i) {
+					float *m0 = &block0[i * kBlockEdge * kBlockEdge];
+					float *m1 = &block1[i * kBlockEdge * kBlockEdge];
+					for (size_t y = 0; y < kBlockEdge; y++) {
+						for (size_t x = 0; x < kBlockEdge; x++) {
+							m0[kBlockEdge * y + x] = xyb0[i][offset + y * xsize_ + x];
+							m1[kBlockEdge * y + x] = xyb1[i][offset + y * xsize_ + x];
+						}
+					}
+				}
+				float diff_xyb_dc[3] = { 0.0 };
+				float diff_xyb_ac[3] = { 0.0 };
+				float diff_xyb_edge_dc[3] = { 0.0 }; // passed to ButteraugliBlockDiffOpt but not read afterwards in this function
+				ButteraugliBlockDiffOpt(block0, block1,
+					diff_xyb_dc, diff_xyb_ac, diff_xyb_edge_dc);
+				for (int i = 0; i < 3; ++i) {
+					(*block_diff_dc)[3 * res_ix + i] = static_cast<float>(diff_xyb_dc[i]); // outputs are interleaved: 3 consecutive floats per cell
+					(*block_diff_ac)[3 * res_ix + i] = static_cast<float>(diff_xyb_ac[i]);
+				}
+			}
+		}
+	}
+
+	void clButteraugliComparator::EdgeDetectorMapOpt(const std::vector<std::vector<float> > &xyb0,
+		const std::vector<std::vector<float> > &xyb1,
+		std::vector<float>* edge_detector_map)
+	{ // Blurs copies of both images per channel, then stores the 8x8 corner edge-detector diff for every step_-spaced cell.
+		static const float kSigma[3] = { // blur sigma for channel 0, 1 and 2 respectively
+			1.5,
+			0.586,
+			0.4,
+		};
+		std::vector<std::vector<float> > blurred0(xyb0); // working copies; the const inputs are not modified
+		std::vector<std::vector<float> > blurred1(xyb1);
+		for (int i = 0; i < 3; i++) {
+			BlurOpt(xsize_, ysize_, blurred0[i].data(), kSigma[i], 0.0); // blurs in place on the copy's buffer
+			BlurOpt(xsize_, ysize_, blurred1[i].data(), kSigma[i], 0.0);
+		}
+		for (size_t res_y = 0; res_y + (8 - step_) < ysize_; res_y += step_) {
+			for (size_t res_x = 0; res_x + (8 - step_) < xsize_; res_x += step_) {
+				size_t res_ix = (res_y * res_xsize_ + res_x) / step_; // index of this cell in the subsampled result grid
+				float diff_xyb[3] = { 0.0 };
+				Butteraugli8x8CornerEdgeDetectorDiffOpt(std::min(res_x, xsize_ - 8), // origin clamped to at most (xsize_ - 8, ysize_ - 8)
+					std::min(res_y, ysize_ - 8),
+					xsize_, ysize_,
+					blurred0, blurred1,
+					diff_xyb);
+				for (int i = 0; i < 3; ++i) {
+					(*edge_detector_map)[3 * res_ix + i] = static_cast<float>(diff_xyb[i]); // 3 consecutive floats per cell
+				}
+			}
+		}
+	}
+
+	void clButteraugliComparator::EdgeDetectorLowFreqOpt(const std::vector<std::vector<float> > &xyb0,
+		const std::vector<std::vector<float> > &xyb1,
+		std::vector<float>* block_diff_ac)
+	{ // Adds a low-frequency edge term to block_diff_ac: heavily blurs both images, compares each pixel with four offset neighbours, and accumulates kMul * the per-channel maximum.
+		static const float kSigma = 14; // blur sigma applied to every channel
+		static const float kMul = 10; // weight applied to the maximum before it is added to block_diff_ac
+		std::vector<std::vector<float> > blurred0(xyb0); // working copies; the const inputs are not modified
+		std::vector<std::vector<float> > blurred1(xyb1);
+		for (int i = 0; i < 3; i++) {
+			BlurOpt(xsize_, ysize_, blurred0[i].data(), kSigma, 0.0);
+			BlurOpt(xsize_, ysize_, blurred1[i].data(), kSigma, 0.0);
+		}
+		const int step = 8; // margin kept from the right/bottom border; distinct from the member step_
+		for (size_t y = 0; y + step < ysize_; y += step_) {
+			int resy = y / step_;
+			int resx = step / step_; // result column starts at step / step_, not 0, and advances with x
+			for (size_t x = 0; x + step < xsize_; x += step_, resx++) {
+				const int ix = y * xsize_ + x;
+				const int res_ix = resy * res_xsize_ + resx;
+				float diff[4][3]; // [neighbour direction][channel]
+				for (int i = 0; i < 3; ++i) {
+					int ix2 = ix + 8; // neighbour 8 pixels to the right
+					diff[0][i] =
+						((blurred1[i][ix] - blurred0[i][ix]) +
+						(blurred0[i][ix2] - blurred1[i][ix2]));
+					ix2 = ix + 8 * xsize_; // neighbour 8 rows down
+					diff[1][i] =
+						((blurred1[i][ix] - blurred0[i][ix]) +
+						(blurred0[i][ix2] - blurred1[i][ix2]));
+					ix2 = ix + 6 * xsize_ + 6; // neighbour 6 rows down, 6 pixels right
+					diff[2][i] =
+						((blurred1[i][ix] - blurred0[i][ix]) +
+						(blurred0[i][ix2] - blurred1[i][ix2]));
+					ix2 = ix + 6 * xsize_ - 6; // neighbour 6 rows down, 6 pixels left
+					diff[3][i] = x < step ? 0 : // forced to 0 for x < step, which includes the columns where x - 6 would fall before the row start
+						((blurred1[i][ix] - blurred0[i][ix]) +
+						(blurred0[i][ix2] - blurred1[i][ix2]));
+				}
+				float max_diff_xyb[3] = { 0 };
+				for (int k = 0; k < 4; ++k) {
+					float diff_xyb[3] = { 0 }; // reset for each direction so the helper's output is per-direction
+					XybDiffLowFreqSquaredAccumulateOpt(diff[k][0], diff[k][1], diff[k][2],
+						0, 0, 0, 1.0,
+						diff_xyb);
+					for (int i = 0; i < 3; ++i) {
+						max_diff_xyb[i] = std::max<float>(max_diff_xyb[i], diff_xyb[i]); // per-channel maximum over the four directions
+					}
+				}
+				for (int i = 0; i < 3; ++i) {
+					(*block_diff_ac)[3 * res_ix + i] += static_cast<float>(kMul * max_diff_xyb[i]); // accumulates onto the existing value
+				}
+			}
+		}
+	}
+
+	void clButteraugliComparator::CombineChannelsOpt(const std::vector<std::vector<float> >& mask_xyb,
+		const std::vector<std::vector<float> >& mask_xyb_dc,
+		const std::vector<float>& block_diff_dc,
+		const std::vector<float>& block_diff_ac,
+		const std::vector<float>& edge_detector_map,
+		std::vector<float>* result)
+	{ // Collapses the three per-cell diff maps into one float per cell by weighting them with the mask values.
+		result->resize(res_xsize_ * res_ysize_);
+		for (size_t res_y = 0; res_y + (8 - step_) < ysize_; res_y += step_) {
+			for (size_t res_x = 0; res_x + (8 - step_) < xsize_; res_x += step_) {
+				size_t res_ix = (res_y * res_xsize_ + res_x) / step_; // index of this cell in the subsampled result grid
+				float mask[3];
+				float dc_mask[3];
+				for (int i = 0; i < 3; ++i) {
+					mask[i] = mask_xyb[i][(res_y + 3) * xsize_ + (res_x + 3)]; // masks are full resolution; sampled at pixel offset (+3, +3) from the cell origin
+					dc_mask[i] = mask_xyb_dc[i][(res_y + 3) * xsize_ + (res_x + 3)];
+				}
+				(*result)[res_ix] = static_cast<float>(
+					DotProductOpt(&block_diff_dc[3 * res_ix], dc_mask) + // DC diff is paired with the DC mask
+					DotProductOpt(&block_diff_ac[3 * res_ix], mask) + // AC diff and edge map are both paired with the regular mask
+					DotProductOpt(&edge_detector_map[3 * res_ix], mask));
+			}
+		}
+	}
+
     void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) 
     {
         if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
@@ -119,7 +1559,7 @@ namespace butteraugli
             _MinSquareVal(square_size, offset, xsize, ysize, values);
             tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values);
         }
-        else
+		else
         {
             _MinSquareVal(square_size, offset, xsize, ysize, values);
         }
@@ -189,6 +1629,10 @@ namespace butteraugli
             );
         }
 #endif
+		else if (MODE_CPU_OPT == g_mathMode)
+		{
+			MaskOpt(xyb0, xyb1, xsize, ysize, mask, mask_dc);
+		}
         else if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
             _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc);
@@ -227,16 +1671,23 @@ namespace butteraugli
         std::vector<std::vector<float> > &xyb0,
         std::vector<std::vector<float> > &xyb1)
     {
-        _MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1);
-
         if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
+			_MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1);
             tclMaskHighIntensityChange(c0[0].data(), c0[1].data(), c0[2].data(),
                 c1[0].data(), c1[1].data(), c1[2].data(),
                 xsize, ysize,
                 xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data());
         }
+		else if (MODE_CPU_OPT == g_mathMode)
+		{
+			MaskHighIntensityChangeOpt(xsize, ysize, c0, c1, xyb0, xyb1);
+		}
+		else
+		{
+			_MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1);
+		}
     }
 
     void ScaleImage(double scale, std::vector<float> *result)
@@ -314,7 +1765,11 @@ namespace butteraugli
             tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), 
                     xsize, ysize,
                     rgb[0].data(), rgb[1].data(), rgb[2].data());
-        }  
+        } 
+		else if (MODE_CPU_OPT == g_mathMode)
+		{
+			OpsinDynamicsImageOpt(xsize, ysize, rgb);
+		}
         else
         {
             _OpsinDynamicsImage(xsize, ysize, rgb);
diff --git a/clguetzli/clbutter_comparator.h b/clguetzli/clbutter_comparator.h
index 23204047..c26de1de 100644
--- a/clguetzli/clbutter_comparator.h
+++ b/clguetzli/clbutter_comparator.h
@@ -15,26 +15,49 @@ namespace butteraugli {
             std::vector<std::vector<float>> &xyb1,
             std::vector<float> &result);
 
+		virtual void DiffmapOpsinDynamicsImageOpt(std::vector<std::vector<float>> &xyb0,
+			std::vector<std::vector<float>> &xyb1,
+			std::vector<float> &result);
+
         virtual void BlockDiffMap(const std::vector<std::vector<float> > &rgb0,
             const std::vector<std::vector<float> > &rgb1,
             std::vector<float>* block_diff_dc,
             std::vector<float>* block_diff_ac);
 
+		virtual void BlockDiffMapOpt(const std::vector<std::vector<float> > &rgb0,
+			const std::vector<std::vector<float> > &rgb1,
+			std::vector<float>* block_diff_dc,
+			std::vector<float>* block_diff_ac);
 
         virtual void EdgeDetectorMap(const std::vector<std::vector<float> > &rgb0,
             const std::vector<std::vector<float> > &rgb1,
             std::vector<float>* edge_detector_map);
 
+		virtual void EdgeDetectorMapOpt(const std::vector<std::vector<float> > &rgb0,
+			const std::vector<std::vector<float> > &rgb1,
+			std::vector<float>* edge_detector_map);
+
         virtual void EdgeDetectorLowFreq(const std::vector<std::vector<float> > &rgb0,
             const std::vector<std::vector<float> > &rgb1,
             std::vector<float>* block_diff_ac);
 
+		virtual void EdgeDetectorLowFreqOpt(const std::vector<std::vector<float> > &rgb0,
+			const std::vector<std::vector<float> > &rgb1,
+			std::vector<float>* block_diff_ac);
+
         virtual void CombineChannels(const std::vector<std::vector<float> >& scale_xyb,
             const std::vector<std::vector<float> >& scale_xyb_dc,
             const std::vector<float>& block_diff_dc,
             const std::vector<float>& block_diff_ac,
             const std::vector<float>& edge_detector_map,
             std::vector<float>* result);
+
+		virtual void CombineChannelsOpt(const std::vector<std::vector<float> >& scale_xyb,
+			const std::vector<std::vector<float> >& scale_xyb_dc,
+			const std::vector<float>& block_diff_dc,
+			const std::vector<float>& block_diff_ac,
+			const std::vector<float>& edge_detector_map,
+			std::vector<float>* result);
     };
 
     void _MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values);
diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp
index 7bd566df..b04d6cc1 100644
--- a/clguetzli/clguetzli.cl.cpp
+++ b/clguetzli/clguetzli.cl.cpp
@@ -61,7 +61,6 @@ namespace guetzli
 
     void ButteraugliComparatorEx::Compare(const OutputImage& img)
     {
-
         if (MODE_OPENCL == g_mathMode)
         {
             std::vector<std::vector<float> > rgb1(3, std::vector<float>(width_ * height_));
@@ -124,11 +123,21 @@ namespace guetzli
             distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_);
         }
 #endif
-        else
+        else if (MODE_CPU_OPT == g_mathMode)
         {
-            ButteraugliComparator::Compare(img);
+			std::vector<std::vector<float> > rgb0 = rgb_orig_opsin;
+
+			std::vector<std::vector<float> > rgb(3, std::vector<float>(width_ * height_));
+			img.ToLinearRGB(&rgb);
+			::butteraugli::OpsinDynamicsImage(width_, height_, rgb);
+			std::vector<float>().swap(distmap_);
+			comparator_.DiffmapOpsinDynamicsImage(rgb0, rgb, distmap_);
+			distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_);
         }
-
+		else
+		{
+			ButteraugliComparator::Compare(img);
+		}
     }
 
     void ButteraugliComparatorEx::StartBlockComparisons()
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index d5c04492..d25f8c80 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -14,6 +14,7 @@
 enum MATH_MODE
 {
     MODE_CPU = 0,
+	MODE_CPU_OPT,
     MODE_OPENCL,
     MODE_CUDA,
     MODE_CHECKCL,
diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc
index 63bc4ed1..e5a335c6 100644
--- a/guetzli/guetzli.cc
+++ b/guetzli/guetzli.cc
@@ -228,6 +228,7 @@ void Usage() {
       "                 the limit. Default limit is %d MB.\n"
 	  "  --opencl     - Use OpenCL\n"
       "  --checkcl    - Check OpenCL result\n"
+	  "  --c          - Use c opt version\n"
 #ifdef __USE_CUDA__
 	  "  --cuda       - Use CUDA\n"	 
       "  --checkcuda  - Check CUDA result\n"
@@ -270,6 +271,10 @@ int main(int argc, char** argv) {
     else if (!strcmp(argv[opt_idx], "--checkcl")) {
         g_mathMode = MODE_CHECKCL;
     }
+	else if (!strcmp(argv[opt_idx], "--c"))
+	{
+		g_mathMode = MODE_CPU_OPT;
+	}
 #ifdef __USE_CUDA__
 	else if (!strcmp(argv[opt_idx], "--cuda")) {
 		g_mathMode = MODE_CUDA;
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 432c62f5..a16fcc36 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -409,53 +409,54 @@ void Processor::ComputeBlockZeroingOrder(
   memcpy(processed_block, block, sizeof(processed_block));
   comparator_->SwitchBlock(block_x, block_y, factor_x, factor_y);
   while (!input_order.empty()) {
-    float best_err = 1e17f;
-    int best_i = 0;
-    for (size_t i = 0; i < std::min<size_t>(params_.zeroing_greedy_lookahead,
-                                         input_order.size());
-         ++i) {
-      coeff_t candidate_block[kBlockSize];
-      memcpy(candidate_block, processed_block, sizeof(candidate_block));
-      const int idx = input_order[i].first;
-      candidate_block[idx] = 0;
-      for (int c = 0; c < 3; ++c) {
-        if (comp_mask & (1 << c)) {
-          img->component(c).SetCoeffBlock(
-              block_x, block_y, &candidate_block[c * kDCTBlockSize]);
-        }
-      }
-      float max_err = 0;
-      for (int iy = 0; iy < factor_y; ++iy) {
-        for (int ix = 0; ix < factor_x; ++ix) {
-          int block_xx = block_x * factor_x + ix;
-          int block_yy = block_y * factor_y + iy;
-          if (8 * block_xx < img->width() && 8 * block_yy < img->height()) {
-            float err = static_cast<float>(comparator_->CompareBlock(*img, ix, iy, candidate_block, comp_mask));
-            max_err = std::max(max_err, err);
-          }
-        }
-      }
-      if (max_err < best_err) {
-        best_err = max_err;
-        best_i = i;
-      }
-    }
-    int idx = input_order[best_i].first;
-    processed_block[idx] = 0;
-    input_order.erase(input_order.begin() + best_i);
-    output_order->push_back({idx, best_err});
-    for (int c = 0; c < 3; ++c) {
-      if (comp_mask & (1 << c)) {
-        img->component(c).SetCoeffBlock(
-            block_x, block_y, &processed_block[c * kDCTBlockSize]);
-      }
-    }
-#ifdef __USE_C__
-    if (best_err >= comparator_->BlockErrorLimit())
-    {   // err������������ģ���������Ѿ�����ErrorLimit�������ļ�������������
-        break;
-    }
-#endif
+	  float best_err = 1e17f;
+	  int best_i = 0;
+	  for (size_t i = 0; i < std::min<size_t>(params_.zeroing_greedy_lookahead,
+		  input_order.size());
+		  ++i) {
+		  coeff_t candidate_block[kBlockSize];
+		  memcpy(candidate_block, processed_block, sizeof(candidate_block));
+		  const int idx = input_order[i].first;
+		  candidate_block[idx] = 0;
+		  for (int c = 0; c < 3; ++c) {
+			  if (comp_mask & (1 << c)) {
+				  img->component(c).SetCoeffBlock(
+					  block_x, block_y, &candidate_block[c * kDCTBlockSize]);
+			  }
+		  }
+		  float max_err = 0;
+		  for (int iy = 0; iy < factor_y; ++iy) {
+			  for (int ix = 0; ix < factor_x; ++ix) {
+				  int block_xx = block_x * factor_x + ix;
+				  int block_yy = block_y * factor_y + iy;
+				  if (8 * block_xx < img->width() && 8 * block_yy < img->height()) {
+					  float err = static_cast<float>(comparator_->CompareBlock(*img, ix, iy, candidate_block, comp_mask));
+					  max_err = std::max(max_err, err);
+				  }
+			  }
+		  }
+		  if (max_err < best_err) {
+			  best_err = max_err;
+			  best_i = i;
+		  }
+	  }
+	  int idx = input_order[best_i].first;
+	  processed_block[idx] = 0;
+	  input_order.erase(input_order.begin() + best_i);
+	  output_order->push_back({ idx, best_err });
+	  for (int c = 0; c < 3; ++c) {
+		  if (comp_mask & (1 << c)) {
+			  img->component(c).SetCoeffBlock(
+				  block_x, block_y, &processed_block[c * kDCTBlockSize]);
+		  }
+	  }
+	  if (MODE_CPU_OPT == g_mathMode)
+	  {
+		  if (best_err >= comparator_->BlockErrorLimit())
+		  {   // NOTE(review): the original comment here was mis-encoded and unreadable. Presumably: once the best error reaches the block error limit, further zeroing is pointless, so stop early - confirm.
+			  break;
+		  }
+	  }
   }
   // Make the block error values monotonic.
   float min_err = 1e10;
@@ -622,7 +623,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
 #endif
     }
 
-    if (MODE_CPU == g_mathMode || MODE_CHECKCL == g_mathMode)
+    if (MODE_CPU_OPT == g_mathMode || MODE_CPU == g_mathMode || MODE_CHECKCL == g_mathMode)
     {
         output_order_cpu.resize(num_blocks * kBlockSize);
         output_order = output_order_cpu.data();

From b67b00d19e7d3f7e7099752fa25a733880b5d629 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Mon, 19 Jun 2017 21:28:17 +0800
Subject: [PATCH 164/189] Modify the flag for creating CUDA context

---
 clguetzli/ocu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index 48f2768a..fddf4e20 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -19,7 +19,7 @@ ocu_args_d_t& getOcu(void)
     CUcontext ctxt;
     CUstream  stream;
 
-    err = cuCtxCreate(&ctxt, CU_CTX_SCHED_BLOCKING_SYNC, dev);
+    err = cuCtxCreate(&ctxt, CU_CTX_SCHED_AUTO, dev);
     LOG_CU_RESULT(err);
 
     char name[1024];

From c1bc10cba3d87f1c6646cff732dec83a1fec9802 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Tue, 20 Jun 2017 19:41:07 +0800
Subject: [PATCH 165/189] Add macro for opencl version

---
 clguetzli/clbutter_comparator.cpp | 99 ++++++++++++++++++++-----------
 clguetzli/clguetzli.cl.cpp        | 32 ++++++----
 clguetzli/clguetzli.cpp           |  8 ++-
 clguetzli/clguetzli.h             | 24 ++++----
 clguetzli/clguetzli_test.cpp      |  6 +-
 clguetzli/ocl.cpp                 |  6 +-
 clguetzli/ocl.h                   |  3 +
 guetzli/guetzli.cc                | 13 ++++
 guetzli/processor.cc              | 25 +++++++-
 9 files changed, 151 insertions(+), 65 deletions(-)

diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index 58e76e54..e39966b1 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -1271,12 +1271,18 @@ namespace butteraugli
         std::vector<std::vector<float>> &xyb1,
         std::vector<float> &result)
     {
-        if (MODE_OPENCL == g_mathMode && xsize_ > 100 && ysize_ > 100)
+		if (MODE_CPU_OPT == g_mathMode)
+		{
+			DiffmapOpsinDynamicsImageOpt(xyb0, xyb1, result);
+		}
+#ifdef __USE_OPENCL__
+        else if (MODE_OPENCL == g_mathMode && xsize_ > 100 && ysize_ > 100)
         {
             result.resize(xsize_ * ysize_);
             clDiffmapOpsinDynamicsImage(result.data(), xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_);
         }
+#endif
 #ifdef __USE_CUDA__
         else if (MODE_CUDA == g_mathMode && xsize_ > 100 && ysize_ > 100)
         {
@@ -1285,10 +1291,6 @@ namespace butteraugli
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data(), xsize_, ysize_, step_);
         }
 #endif
-		else if (MODE_CPU_OPT == g_mathMode)
-		{
-			DiffmapOpsinDynamicsImageOpt(xyb0, xyb1, result);
-		}
         else
         {
             ButteraugliComparator::DiffmapOpsinDynamicsImage(xyb0, xyb1, result);
@@ -1302,7 +1304,7 @@ namespace butteraugli
         std::vector<float>* block_diff_ac)
     {
         ButteraugliComparator::BlockDiffMap(xyb0, xyb1, block_diff_dc, block_diff_ac);
-
+#ifdef __USE_OPENCL__
         if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8)
         {
             tclBlockDiffMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
@@ -1310,6 +1312,7 @@ namespace butteraugli
                 xsize_, ysize_, step_,
                 (*block_diff_dc).data(), (*block_diff_ac).data());
         }
+#endif
     }
 	
     void clButteraugliComparator::EdgeDetectorMap(const std::vector<std::vector<float> > &xyb0,
@@ -1317,7 +1320,7 @@ namespace butteraugli
         std::vector<float>* edge_detector_map)
     {
         ButteraugliComparator::EdgeDetectorMap(xyb0, xyb1, edge_detector_map);
-
+#ifdef __USE_OPENCL__
         if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8)
         {
             tclEdgeDetectorMap(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
@@ -1325,12 +1328,14 @@ namespace butteraugli
                 xsize_, ysize_, step_, 
                 (*edge_detector_map).data());
         }
+#endif
     }
 
     void clButteraugliComparator::EdgeDetectorLowFreq(const std::vector<std::vector<float> > &xyb0,
         const std::vector<std::vector<float> > &xyb1,
         std::vector<float>* block_diff_ac)
     {
+#ifdef __USE_OPENCL__
         if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8)
         {
             std::vector<float> orign_ac = *block_diff_ac;
@@ -1341,6 +1346,7 @@ namespace butteraugli
                 orign_ac.data(), (*block_diff_ac).data());
         }
         else
+#endif
         {
             ButteraugliComparator::EdgeDetectorLowFreq(xyb0, xyb1, block_diff_ac);
         }
@@ -1353,6 +1359,7 @@ namespace butteraugli
         const std::vector<float>& edge_detector_map,
         std::vector<float>* result)
     {
+#ifdef __USE_OPENCL__
         if (MODE_CHECKCL == g_mathMode && xsize_ > 8 && ysize_ > 8)
         {
             std::vector<float> temp = *result;
@@ -1364,6 +1371,7 @@ namespace butteraugli
                 block_diff_ac.data(), edge_detector_map.data(), xsize_, ysize_, res_xsize_, res_ysize_, step_, &temp[0], &(*result)[0]);
         }
         else
+#endif
         {
             ButteraugliComparator::CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac, edge_detector_map, result);
         }
@@ -1551,6 +1559,7 @@ namespace butteraugli
 
     void MinSquareVal(size_t square_size, size_t offset, size_t xsize, size_t ysize, float *values) 
     {
+#ifdef __USE_OPENCL__
         if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
             std::vector<float> img;
@@ -1560,6 +1569,7 @@ namespace butteraugli
             tclMinSquareVal(img.data(), square_size, offset, xsize, ysize, values);
         }
 		else
+#endif
         {
             _MinSquareVal(square_size, offset, xsize, ysize, values);
         }
@@ -1567,6 +1577,7 @@ namespace butteraugli
 
     void Average5x5(int xsize, int ysize, std::vector<float>* diffs)
     {
+#ifdef __USE_OPENCL__
         if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
             std::vector<float> diffs_org = *diffs;
@@ -1574,6 +1585,7 @@ namespace butteraugli
             tclAverage5x5(xsize, ysize, diffs_org, *diffs);
         }
         else
+#endif
         {
             _Average5x5(xsize, ysize, diffs);
         }
@@ -1583,10 +1595,12 @@ namespace butteraugli
     {
         _DiffPrecompute(xyb0, xyb1, xsize, ysize, mask);
 
+#ifdef __USE_OPENCL__
         if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
             tclDiffPrecompute(xyb0, xyb1, xsize, ysize, mask);
         }
+#endif
     }
 
     void Mask(const std::vector<std::vector<float> > &xyb0,
@@ -1595,7 +1609,12 @@ namespace butteraugli
         std::vector<std::vector<float> > *mask,
         std::vector<std::vector<float> > *mask_dc)
     {
-        if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100)
+		if (MODE_CPU_OPT == g_mathMode)
+		{
+			MaskOpt(xyb0, xyb1, xsize, ysize, mask, mask_dc);
+		}
+#ifdef __USE_OPENCL__
+        else if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100)
         {
             mask->resize(3);
             mask_dc->resize(3);
@@ -1611,6 +1630,16 @@ namespace butteraugli
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data()
                 );
         }
+		else if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
+		{
+			_Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc);
+			tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
+				xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
+				xsize, ysize,
+				(*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(),
+				(*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data());
+		}
+#endif
 #ifdef __USE_CUDA__
         else if (MODE_CUDA == g_mathMode && xsize > 100 && ysize > 100)
         {
@@ -1629,19 +1658,6 @@ namespace butteraugli
             );
         }
 #endif
-		else if (MODE_CPU_OPT == g_mathMode)
-		{
-			MaskOpt(xyb0, xyb1, xsize, ysize, mask, mask_dc);
-		}
-        else if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
-        {
-            _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc);
-            tclMask(xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
-                xyb1[0].data(), xyb1[1].data(), xyb1[2].data(),
-                xsize, ysize,
-                (*mask)[0].data(), (*mask)[1].data(), (*mask)[2].data(),
-                (*mask_dc)[0].data(), (*mask_dc)[1].data(), (*mask_dc)[2].data());
-        }
         else
         {
             _Mask(xyb0, xyb1, xsize, ysize, mask, mask_dc);
@@ -1652,6 +1668,7 @@ namespace butteraugli
         const size_t step,
         std::vector<float>* diffmap)
     {
+#ifdef __USE_OPENCL__
         if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
             std::vector<float> diffmap_org = *diffmap;
@@ -1659,6 +1676,7 @@ namespace butteraugli
             tclCalculateDiffmap(xsize, ysize, step, diffmap_org.data(), diffmap_org.size(), (*diffmap).data());
         }
         else
+#endif
         {
             _CalculateDiffmap(xsize, ysize, step, diffmap);
         }
@@ -1671,6 +1689,7 @@ namespace butteraugli
         std::vector<std::vector<float> > &xyb0,
         std::vector<std::vector<float> > &xyb1)
     {
+#ifdef __USE_OPENCL__
         if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
 			_MaskHighIntensityChange(xsize, ysize, c0, c1, xyb0, xyb1);
@@ -1680,7 +1699,9 @@ namespace butteraugli
                 xyb0[0].data(), xyb0[1].data(), xyb0[2].data(),
                 xyb1[0].data(), xyb1[1].data(), xyb1[2].data());
         }
-		else if (MODE_CPU_OPT == g_mathMode)
+		else
+#endif
+		if (MODE_CPU_OPT == g_mathMode)
 		{
 			MaskHighIntensityChangeOpt(xsize, ysize, c0, c1, xyb0, xyb1);
 		}
@@ -1692,6 +1713,7 @@ namespace butteraugli
 
     void ScaleImage(double scale, std::vector<float> *result)
     {
+#ifdef __USE_OPENCL__
         if (MODE_CHECKCL == g_mathMode && result->size() > 64)
         {
             std::vector<float> result_org = *result;
@@ -1699,6 +1721,7 @@ namespace butteraugli
             tclScaleImage(scale, result_org.data(), (*result).data(), (*result).size());
         }
         else
+#endif
         {
             _ScaleImage(scale, result);
         }
@@ -1714,15 +1737,18 @@ namespace butteraugli
     {
         _Convolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
 
+#ifdef __USE_OPENCL__
         if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
             tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
         }
+#endif
     }
 
     void Blur(size_t xsize, size_t ysize, float* channel, double sigma,
         double border_ratio)
     {
+#ifdef __USE_OPENCL__
         if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
             std::vector<float> orignChannel;
@@ -1732,6 +1758,7 @@ namespace butteraugli
             tclBlur(orignChannel.data(), xsize, ysize, sigma, border_ratio, channel);
         }
         else
+#endif
         {
             _Blur(xsize, ysize, channel, sigma, border_ratio);
         }
@@ -1740,7 +1767,12 @@ namespace butteraugli
     void OpsinDynamicsImage(size_t xsize, size_t ysize,
         std::vector<std::vector<float> > &rgb)
     {
-        if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100)
+		if (MODE_CPU_OPT == g_mathMode)
+		{
+			OpsinDynamicsImageOpt(xsize, ysize, rgb);
+		}
+#ifdef __USE_OPENCL__
+        else if (MODE_OPENCL == g_mathMode && xsize > 100 && ysize > 100)
         {
             float * r = rgb[0].data();
             float * g = rgb[1].data();
@@ -1748,6 +1780,15 @@ namespace butteraugli
 
             clOpsinDynamicsImage(r, g, b, xsize, ysize);
         }
+		else if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8) // logical &&, matching the MODE_CHECKCL branch in Mask(); the bitwise & gave the same result but was a typo
+		{
+			std::vector< std::vector<float>> orig_rgb = rgb; // keep the untransformed input for the OpenCL check below
+			_OpsinDynamicsImage(xsize, ysize, rgb); // reference CPU result, written into rgb in place
+			tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(),
+				xsize, ysize,
+				rgb[0].data(), rgb[1].data(), rgb[2].data()); // passes the original input together with the CPU result
+		}
+#endif
 #ifdef __USE_CUDA__
         else if (MODE_CUDA == g_mathMode && xsize > 100 && ysize > 100)
         {
@@ -1758,18 +1799,6 @@ namespace butteraugli
             cuOpsinDynamicsImage(r, g, b, xsize, ysize);
         }
 #endif
-        else if (MODE_CHECKCL == g_mathMode && xsize > 8 & ysize > 8)
-        {
-            std::vector< std::vector<float>> orig_rgb = rgb;
-            _OpsinDynamicsImage(xsize, ysize, rgb);
-            tclOpsinDynamicsImage(orig_rgb[0].data(), orig_rgb[1].data(), orig_rgb[2].data(), 
-                    xsize, ysize,
-                    rgb[0].data(), rgb[1].data(), rgb[2].data());
-        } 
-		else if (MODE_CPU_OPT == g_mathMode)
-		{
-			OpsinDynamicsImageOpt(xsize, ysize, rgb);
-		}
         else
         {
             _OpsinDynamicsImage(xsize, ysize, rgb);
diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp
index b04d6cc1..5b382ae6 100644
--- a/clguetzli/clguetzli.cl.cpp
+++ b/clguetzli/clguetzli.cl.cpp
@@ -3,6 +3,8 @@
 #include <vector>
 #include "utils.h"
 
+#ifdef __USE_OPENCL__
+
 using namespace std;
 
 int g_idvec[10] = { 0 };
@@ -61,7 +63,19 @@ namespace guetzli
 
     void ButteraugliComparatorEx::Compare(const OutputImage& img)
     {
-        if (MODE_OPENCL == g_mathMode)
+		if (MODE_CPU_OPT == g_mathMode)
+		{
+			std::vector<std::vector<float> > rgb0 = rgb_orig_opsin;
+
+			std::vector<std::vector<float> > rgb(3, std::vector<float>(width_ * height_));
+			img.ToLinearRGB(&rgb);
+			::butteraugli::OpsinDynamicsImage(width_, height_, rgb);
+			std::vector<float>().swap(distmap_);
+			comparator_.DiffmapOpsinDynamicsImage(rgb0, rgb, distmap_);
+			distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_);
+		}
+#ifdef __USE_OPENCL__
+        else if (MODE_OPENCL == g_mathMode)
         {
             std::vector<std::vector<float> > rgb1(3, std::vector<float>(width_ * height_));
             img.ToLinearRGB(&rgb1);
@@ -92,6 +106,7 @@ namespace guetzli
 
             distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_);
         }
+#endif
 #ifdef __USE_CUDA__
         else if (MODE_CUDA == g_mathMode)
         {
@@ -123,17 +138,6 @@ namespace guetzli
             distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_);
         }
 #endif
-        else if (MODE_CPU_OPT == g_mathMode)
-        {
-			std::vector<std::vector<float> > rgb0 = rgb_orig_opsin;
-
-			std::vector<std::vector<float> > rgb(3, std::vector<float>(width_ * height_));
-			img.ToLinearRGB(&rgb);
-			::butteraugli::OpsinDynamicsImage(width_, height_, rgb);
-			std::vector<float>().swap(distmap_);
-			comparator_.DiffmapOpsinDynamicsImage(rgb0, rgb, distmap_);
-			distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_);
-        }
 		else
 		{
 			ButteraugliComparator::Compare(img);
@@ -239,4 +243,6 @@ namespace guetzli
 */
         return err;
     }
-}
\ No newline at end of file
+}
+
+#endif
\ No newline at end of file
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index 8f39fb46..be8e8c10 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -4,12 +4,14 @@
 #include <vector>
 #include "cl.hpp"
 
+extern MATH_MODE g_mathMode = MODE_CPU;
+
+#ifdef __USE_OPENCL__
+
 #ifdef __USE_DOUBLE_AS_FLOAT__
 #define double float
 #endif
 
-extern MATH_MODE g_mathMode = MODE_CPU;
-
 void clOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, const size_t ysize)
 {
     size_t channel_size = xsize * ysize * sizeof(float);
@@ -827,4 +829,6 @@ void clCalculateDiffmapEx(cl_mem diffmap/*in,out*/, const size_t xsize, const si
 
 #ifdef __USE_DOUBLE_AS_FLOAT__
 #undef double
+#endif
+
 #endif
\ No newline at end of file
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index d25f8c80..c01da7a4 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -7,22 +7,24 @@
 
 #include "cuguetzli.h"
 
-#ifdef __USE_DOUBLE_AS_FLOAT__
-#define double float
-#endif
-
 enum MATH_MODE
 {
-    MODE_CPU = 0,
+	MODE_CPU = 0,
 	MODE_CPU_OPT,
-    MODE_OPENCL,
-    MODE_CUDA,
-    MODE_CHECKCL,
-    MODE_CHECKCUDA
+	MODE_OPENCL,
+	MODE_CUDA,
+	MODE_CHECKCL,
+	MODE_CHECKCUDA
 };
 
 extern MATH_MODE g_mathMode;
 
+#ifdef __USE_OPENCL__
+
+#ifdef __USE_DOUBLE_AS_FLOAT__
+#define double float
+#endif
+
 void clOpsinDynamicsImage(
     float *r, float *g, float *b, 
     const size_t xsize, const size_t ysize);
@@ -174,4 +176,6 @@ namespace guetzli {
         std::vector<float> imgMaskXyzScaleBlockList;    // [RGBRGB..RGBRGB]:blockCount
         std::vector<std::vector<float>> rgb_orig_opsin;
     };
-}
\ No newline at end of file
+}
+
+#endif
\ No newline at end of file
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 9cb4007d..b5fa50c5 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -7,6 +7,8 @@
 #include "ocl.h"
 #include "ocu.h"
 
+#ifdef __USE_OPENCL__
+
 #define FLOAT_COMPARE(a, b, c)  floatCompare((a), (b), (c), __FUNCTION__, __LINE__ )
 
 int floatCompare(const float* a, const float* b, size_t size, const char* szFunc, int line)
@@ -446,4 +448,6 @@ void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_
 	err = clFinish(ocl.commandQueue);
 
 	ocl.releaseMemChannels(rgb);
-}
\ No newline at end of file
+}
+
+#endif
\ No newline at end of file
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index 639ad68e..f4427fff 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -2,6 +2,8 @@
 #include <string.h>
 #include <vector>
 
+#ifdef __USE_OPENCL__
+
 ocl_args_d_t& getOcl(void)
 {
     static bool bInit = false;
@@ -543,4 +545,6 @@ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType)
 	}
 
 	return CL_SUCCESS;
-}
\ No newline at end of file
+}
+
+#endif
\ No newline at end of file
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index a9573fa6..f182bb88 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -4,6 +4,8 @@
 #include "utils.h"
 #include "clguetzli.cl.h"
 
+#ifdef __USE_OPENCL__
+
 // Macros for OpenCL versions
 #define OPENCL_VERSION_1_2  1.2f
 #define OPENCL_VERSION_2_0  2.0f
@@ -61,3 +63,4 @@ struct ocl_args_d_t
 	float            compilerVersion;   // hold the device OpenCL C version (default. 1.2)
 };
 
+#endif
diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc
index e5a335c6..c972d391 100644
--- a/guetzli/guetzli.cc
+++ b/guetzli/guetzli.cc
@@ -29,6 +29,9 @@
 #include "guetzli/quality.h"
 #include "guetzli/stats.h"
 #include "clguetzli/clguetzli.h"
+#ifdef __USE_GPERFTOOLS__
+#include <google/profiler.h>
+#endif
 
 namespace {
 
@@ -226,8 +229,10 @@ void Usage() {
       "                 Default value is %d.\n"
       "  --memlimit M - Memory limit in MB. Guetzli will fail if unable to stay under\n"
       "                 the limit. Default limit is %d MB.\n"
+#ifdef __USE_OPENCL__
 	  "  --opencl     - Use OpenCL\n"
       "  --checkcl    - Check OpenCL result\n"
+#endif
 	  "  --c          - Use c opt version\n"
 #ifdef __USE_CUDA__
 	  "  --cuda       - Use CUDA\n"	 
@@ -240,6 +245,9 @@ void Usage() {
 }  // namespace
 
 int main(int argc, char** argv) {
+#ifdef __USE_GPERFTOOLS__
+	ProfilerStart("guetzli.prof");
+#endif
   std::set_terminate(TerminateHandler);
 
   int verbose = 0;
@@ -265,12 +273,14 @@ int main(int argc, char** argv) {
     } else if (!strcmp(argv[opt_idx], "--nomemlimit")) {
       memlimit_mb = -1;
 	}
+#ifdef __USE_OPENCL__
 	else if (!strcmp(argv[opt_idx], "--opencl")) {
 		g_mathMode = MODE_OPENCL;
 	}
     else if (!strcmp(argv[opt_idx], "--checkcl")) {
         g_mathMode = MODE_CHECKCL;
     }
+#endif
 	else if (!strcmp(argv[opt_idx], "--c"))
 	{
 		g_mathMode = MODE_CPU_OPT;
@@ -351,5 +361,8 @@ int main(int argc, char** argv) {
   }
 
   WriteFileOrDie(argv[opt_idx + 1], out_data);
+#ifdef __USE_GPERFTOOLS__
+  ProfilerStop();
+#endif
   return 0;
 }
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index a16fcc36..3d39da02 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -569,11 +569,14 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
 
     std::vector<CoeffData> output_order_gpu;
     std::vector<CoeffData> output_order_cpu;
-    CoeffData * output_order = NULL;
-    ButteraugliComparatorEx * comp = (ButteraugliComparatorEx*)comparator_;
+
+	CoeffData * output_order = NULL;
 
     if (MODE_OPENCL == g_mathMode || MODE_CUDA == g_mathMode)
     {
+#ifdef __USE_OPENCL__
+		ButteraugliComparatorEx * comp = (ButteraugliComparatorEx*)comparator_;
+
         channel_info orig_channel[3];
         channel_info mayout_channel[3];
 
@@ -606,6 +609,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
                 comp_mask,
                 comp->BlockErrorLimit());
         }
+#endif
 #ifdef __USE_CUDA__
         else
         {
@@ -622,8 +626,11 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
         }
 #endif
     }
-
+#ifdef __USE_OPENCL__
     if (MODE_CPU_OPT == g_mathMode || MODE_CPU == g_mathMode || MODE_CHECKCL == g_mathMode)
+#else
+	if (MODE_CPU_OPT == g_mathMode || MODE_CPU == g_mathMode)
+#endif
     {
         output_order_cpu.resize(num_blocks * kBlockSize);
         output_order = output_order_cpu.data();
@@ -1038,9 +1045,15 @@ bool Process(const Params& params, ProcessStats* stats,
   }
   std::unique_ptr<ButteraugliComparator> comparator;
   if (jpg.width >= 32 && jpg.height >= 32) {
+#ifdef __USE_OPENCL__
     comparator.reset(
         new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb,
                                   params.butteraugli_target, stats));
+#else
+	comparator.reset(
+		new ButteraugliComparator(jpg.width, jpg.height, &rgb,
+			params.butteraugli_target, stats));
+#endif
   }
   bool ok = ProcessJpegData(params, jpg, comparator.get(), &out, stats);
   *jpg_out = out.jpeg_data;
@@ -1062,9 +1075,15 @@ bool Process(const Params& params, ProcessStats* stats,
   }
   std::unique_ptr<ButteraugliComparator> comparator;
   if (jpg.width >= 32 && jpg.height >= 32) {
+#ifdef __USE_OPENCL__
     comparator.reset(
         new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb,
                                   params.butteraugli_target, stats));
+#else
+	  comparator.reset(
+		new ButteraugliComparator(jpg.width, jpg.height, &rgb,
+			params.butteraugli_target, stats));
+#endif
   }
   bool ok = ProcessJpegData(params, jpg, comparator.get(), &out, stats);
   *jpg_out = out.jpeg_data;

From 66a8d9f0644371d62f47720f5a7332b4fde2a1fa Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Wed, 21 Jun 2017 17:28:03 +0800
Subject: [PATCH 166/189] Add simple cuda memory pool

---
 clguetzli/clguetzli.cl.cpp |  14 +-
 clguetzli/cuguetzli.cpp    | 342 ++++++++++++++++++-------------------
 clguetzli/ocu.cpp          |  20 +--
 clguetzli/ocu.h            |   5 +
 guetzli.vcxproj            |   9 +-
 guetzli.vcxproj.filters    |   6 +
 6 files changed, 204 insertions(+), 192 deletions(-)

diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp
index 5b382ae6..f29a283c 100644
--- a/clguetzli/clguetzli.cl.cpp
+++ b/clguetzli/clguetzli.cl.cpp
@@ -119,11 +119,11 @@ namespace guetzli
             distmap_.resize(xsize * ysize);
 
             size_t channel_size = xsize * ysize * sizeof(float);
-            ocu_args_d_t &ocl = getOcu();
-            ocu_channels xyb0 = ocl.allocMemChannels(channel_size, rgb_orig_opsin[0].data(), rgb_orig_opsin[1].data(), rgb_orig_opsin[2].data());
-            ocu_channels xyb1 = ocl.allocMemChannels(channel_size, rgb1[0].data(), rgb1[1].data(), rgb1[2].data());
+            ocu_args_d_t &ocu = getOcu();
+            ocu_channels xyb0 = ocu.allocMemChannels(channel_size, rgb_orig_opsin[0].data(), rgb_orig_opsin[1].data(), rgb_orig_opsin[2].data());
+            ocu_channels xyb1 = ocu.allocMemChannels(channel_size, rgb1[0].data(), rgb1[1].data(), rgb1[2].data());
             
-            cu_mem mem_result = ocl.allocMem(channel_size);
+            cu_mem mem_result = ocu.allocMem(channel_size);
 
             cuOpsinDynamicsImageEx(xyb1, xsize, ysize);
 
@@ -131,9 +131,9 @@ namespace guetzli
 
             cuMemcpyDtoH(distmap_.data(), mem_result, channel_size);
 
-            cuMemFree(mem_result);
-            ocl.releaseMemChannels(xyb0);
-            ocl.releaseMemChannels(xyb1);
+            ocu.releaseMem(mem_result);
+            ocu.releaseMemChannels(xyb0);
+            ocu.releaseMemChannels(xyb1);
 
             distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(distmap_);
         }
diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
index 3b8c2835..1903c6eb 100644
--- a/clguetzli/cuguetzli.cpp
+++ b/clguetzli/cuguetzli.cpp
@@ -18,17 +18,17 @@ void cuOpsinDynamicsImage(float *r, float *g, float *b, const size_t xsize, cons
 {
     size_t channel_size = xsize * ysize * sizeof(float);
 
-    ocu_args_d_t &ocl = getOcu();
-    ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
+    ocu_args_d_t &ocu = getOcu();
+    ocu_channels rgb = ocu.allocMemChannels(channel_size, r, g, b);
 
     cuOpsinDynamicsImageEx(rgb, xsize, ysize);
 
-    cuMemcpyDtoHAsync(r, rgb.r, channel_size, ocl.commandQueue);
-    cuMemcpyDtoHAsync(g, rgb.g, channel_size, ocl.commandQueue);
-	cuMemcpyDtoHAsync(b, rgb.b, channel_size, ocl.commandQueue);
-    cuFinish(ocl.commandQueue);
+    cuMemcpyDtoHAsync(r, rgb.r, channel_size, ocu.commandQueue);
+    cuMemcpyDtoHAsync(g, rgb.g, channel_size, ocu.commandQueue);
+	cuMemcpyDtoHAsync(b, rgb.b, channel_size, ocu.commandQueue);
+    cuFinish(ocu.commandQueue);
 
-    ocl.releaseMemChannels(rgb);
+    ocu.releaseMemChannels(rgb);
 }
 
 void cuDiffmapOpsinDynamicsImage(
@@ -40,20 +40,20 @@ void cuDiffmapOpsinDynamicsImage(
 {
     size_t channel_size = xsize * ysize * sizeof(float);
 
-    ocu_args_d_t &ocl = getOcu();
-    ocu_channels xyb0 = ocl.allocMemChannels(channel_size, r, g, b);
-    ocu_channels xyb1 = ocl.allocMemChannels(channel_size, r2, g2, b2);
+    ocu_args_d_t &ocu = getOcu();
+    ocu_channels xyb0 = ocu.allocMemChannels(channel_size, r, g, b);
+    ocu_channels xyb1 = ocu.allocMemChannels(channel_size, r2, g2, b2);
 
-    cu_mem mem_result = ocl.allocMem(channel_size, result);
+    cu_mem mem_result = ocu.allocMem(channel_size, result);
 
     cuDiffmapOpsinDynamicsImageEx(mem_result, xyb0, xyb1, xsize, ysize, step);
 
     cuMemcpyDtoH(result, mem_result, channel_size);
 
-    ocl.releaseMemChannels(xyb1);
-    ocl.releaseMemChannels(xyb0);
+    ocu.releaseMemChannels(xyb1);
+    ocu.releaseMemChannels(xyb0);
 
-    cuMemFree(mem_result);
+    ocu.releaseMem(mem_result);
 }
 
 void cuComputeBlockZeroingOrder(
@@ -75,7 +75,7 @@ void cuComputeBlockZeroingOrder(
 
     using namespace guetzli;
 
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
     cu_mem mem_orig_coeff[3];
     cu_mem mem_mayout_coeff[3];
@@ -83,20 +83,20 @@ void cuComputeBlockZeroingOrder(
     for (int c = 0; c < 3; c++)
     {
         int block_count = orig_channel[c].block_width * orig_channel[c].block_height;
-        mem_orig_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff);
+        mem_orig_coeff[c] = ocu.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, orig_channel[c].coeff);
 
         block_count = mayout_channel[c].block_width * mayout_channel[c].block_height;
-        mem_mayout_coeff[c] = ocl.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff);
+        mem_mayout_coeff[c] = ocu.allocMem(block_count * sizeof(::coeff_t) * kDCTBlockSize, mayout_channel[c].coeff);
 
-        mem_mayout_pixel[c] = ocl.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel);
+        mem_mayout_pixel[c] = ocu.allocMem(image_width * image_height * sizeof(uint16_t), mayout_channel[c].pixel);
     }
-    cu_mem mem_orig_image = ocl.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch);
-    cu_mem mem_mask_scale = ocl.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale);
+    cu_mem mem_orig_image = ocu.allocMem(sizeof(float) * 3 * kDCTBlockSize * block8_width * block8_height, orig_image_batch);
+    cu_mem mem_mask_scale = ocu.allocMem(sizeof(float) * 3 * block8_width * block8_height, mask_scale);
 
     int output_order_batch_size = sizeof(CoeffData) * 3 * kDCTBlockSize * blockf_width * blockf_height;
-    cu_mem mem_output_order_batch = ocl.allocMem(output_order_batch_size, output_order_batch);
+    cu_mem mem_output_order_batch = ocu.allocMem(output_order_batch_size, output_order_batch);
 
-    CUfunction kernel = ocl.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER];
+    CUfunction kernel = ocu.kernel[KERNEL_COMPUTEBLOCKZEROINGORDER];
     const void *args[] = { &mem_orig_coeff[0], &mem_orig_coeff[1], &mem_orig_coeff[2],
         &mem_orig_image, &mem_mask_scale,
         &blockf_width, &blockf_height,
@@ -113,24 +113,24 @@ void cuComputeBlockZeroingOrder(
         BLOCK_COUNT_X(blockf_width), BLOCK_COUNT_Y(blockf_height), 1,
         BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
     LOG_CU_RESULT(err);
 
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 
     cuMemcpyDtoH(output_order_batch, mem_output_order_batch, output_order_batch_size);
 
     for (int c = 0; c < 3; c++)
     {
-        cuMemFree(mem_orig_coeff[c]);
-        cuMemFree(mem_mayout_coeff[c]);
-        cuMemFree(mem_mayout_pixel[c]);
+        ocu.releaseMem(mem_orig_coeff[c]);
+        ocu.releaseMem(mem_mayout_coeff[c]);
+        ocu.releaseMem(mem_mayout_pixel[c]);
     }
 
-    cuMemFree(mem_orig_image);
-    cuMemFree(mem_mask_scale);
-    cuMemFree(mem_output_order_batch);
+    ocu.releaseMem(mem_orig_image);
+    ocu.releaseMem(mem_mask_scale);
+    ocu.releaseMem(mem_output_order_batch);
 }
 
 void cuMask(
@@ -140,29 +140,29 @@ void cuMask(
     const float* r, const float* g, const float* b,
     const float* r2, const float* g2, const float* b2)
 {
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
     size_t channel_size = xsize * ysize * sizeof(float);
 
-    ocu_channels rgb = ocl.allocMemChannels(channel_size, r, g, b);
-    ocu_channels rgb2 = ocl.allocMemChannels(channel_size, r2, g2, b2);
-    ocu_channels mask = ocl.allocMemChannels(channel_size);
-    ocu_channels mask_dc = ocl.allocMemChannels(channel_size);
+    ocu_channels rgb = ocu.allocMemChannels(channel_size, r, g, b);
+    ocu_channels rgb2 = ocu.allocMemChannels(channel_size, r2, g2, b2);
+    ocu_channels mask = ocu.allocMemChannels(channel_size);
+    ocu_channels mask_dc = ocu.allocMemChannels(channel_size);
 
     cuMaskEx(mask, mask_dc, rgb, rgb2, xsize, ysize);
 
-    cuMemcpyDtoHAsync(mask_r, mask.r, channel_size, ocl.commandQueue);
-    cuMemcpyDtoHAsync(mask_g, mask.g, channel_size, ocl.commandQueue);
-    cuMemcpyDtoHAsync(mask_b, mask.b, channel_size, ocl.commandQueue);
-    cuMemcpyDtoHAsync(maskdc_r, mask_dc.r, channel_size, ocl.commandQueue);
-    cuMemcpyDtoHAsync(maskdc_g, mask_dc.g, channel_size, ocl.commandQueue);
-    cuMemcpyDtoHAsync(maskdc_b, mask_dc.b, channel_size, ocl.commandQueue);
-    cuFinish(ocl.commandQueue);
-
-    ocl.releaseMemChannels(rgb);
-    ocl.releaseMemChannels(rgb2);
-    ocl.releaseMemChannels(mask);
-    ocl.releaseMemChannels(mask_dc);
+    cuMemcpyDtoHAsync(mask_r, mask.r, channel_size, ocu.commandQueue);
+    cuMemcpyDtoHAsync(mask_g, mask.g, channel_size, ocu.commandQueue);
+    cuMemcpyDtoHAsync(mask_b, mask.b, channel_size, ocu.commandQueue);
+    cuMemcpyDtoHAsync(maskdc_r, mask_dc.r, channel_size, ocu.commandQueue);
+    cuMemcpyDtoHAsync(maskdc_g, mask_dc.g, channel_size, ocu.commandQueue);
+    cuMemcpyDtoHAsync(maskdc_b, mask_dc.b, channel_size, ocu.commandQueue);
+    cuFinish(ocu.commandQueue);
+
+    ocu.releaseMemChannels(rgb);
+    ocu.releaseMemChannels(rgb2);
+    ocu.releaseMemChannels(mask);
+    ocu.releaseMemChannels(mask_dc);
 }
 
 void cuDiffmapOpsinDynamicsImageEx(
@@ -178,11 +178,11 @@ void cuDiffmapOpsinDynamicsImageEx(
     size_t channel_size = xsize * ysize * sizeof(float);
     size_t channel_step_size = res_xsize * res_ysize * sizeof(float);
 
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
  
-    cu_mem edge_detector_map = ocl.allocMem(3 * channel_step_size);
-    cu_mem block_diff_dc = ocl.allocMem(3 * channel_step_size);
-    cu_mem block_diff_ac = ocl.allocMem(3 * channel_step_size);
+    cu_mem edge_detector_map = ocu.allocMem(3 * channel_step_size);
+    cu_mem block_diff_dc = ocu.allocMem(3 * channel_step_size);
+    cu_mem block_diff_ac = ocu.allocMem(3 * channel_step_size);
 
     cuMaskHighIntensityChangeEx(xyb0, xyb1, xsize, ysize);
 
@@ -190,20 +190,20 @@ void cuDiffmapOpsinDynamicsImageEx(
     cuBlockDiffMapEx(block_diff_dc, block_diff_ac, xyb0, xyb1, xsize, ysize, step);
     cuEdgeDetectorLowFreqEx(block_diff_ac, xyb0, xyb1, xsize, ysize, step);
     {
-        ocu_channels mask = ocl.allocMemChannels(channel_size);
-        ocu_channels mask_dc = ocl.allocMemChannels(channel_size);
+        ocu_channels mask = ocu.allocMemChannels(channel_size);
+        ocu_channels mask_dc = ocu.allocMemChannels(channel_size);
         cuMaskEx(mask, mask_dc, xyb0, xyb1, xsize, ysize);
         cuCombineChannelsEx(result, mask, mask_dc, xsize, ysize, block_diff_dc, block_diff_ac, edge_detector_map, res_xsize, step);
 
-        ocl.releaseMemChannels(mask);
-        ocl.releaseMemChannels(mask_dc);
+        ocu.releaseMemChannels(mask);
+        ocu.releaseMemChannels(mask_dc);
     }
 
     cuCalculateDiffmapEx(result, xsize, ysize, step);
 
-    cuMemFree(edge_detector_map);
-    cuMemFree(block_diff_dc);
-    cuMemFree(block_diff_ac);
+    ocu.releaseMem(edge_detector_map);
+    ocu.releaseMem(block_diff_dc);
+    ocu.releaseMem(block_diff_ac);
 }
 
 void cuConvolutionEx(
@@ -212,20 +212,20 @@ void cuConvolutionEx(
     const cu_mem multipliers, size_t len,
     int xstep, int offset, float border_ratio)
 {
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
     size_t oxsize = (xsize + xstep - 1) / xstep;
 
-	CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTION];
+	CUfunction kernel = ocu.kernel[KERNEL_CONVOLUTION];
     const void *args[] = { &result, &inp, &xsize, &multipliers, &len, &xstep, &offset, &border_ratio };
 
     CUresult err = cuLaunchKernel(kernel,
         oxsize, ysize, 1,
         1, 1, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -236,18 +236,18 @@ void cuConvolutionXEx(
     const cu_mem multipliers, size_t len,
     int xstep, int offset, float border_ratio)
 {
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
-	CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTIONX];
+	CUfunction kernel = ocu.kernel[KERNEL_CONVOLUTIONX];
     const void *args[] = { &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
 
     CUresult err = cuLaunchKernel(kernel,
         BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
         BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -257,18 +257,18 @@ void cuConvolutionYEx(
     const cu_mem multipliers, size_t len,
     int xstep, int offset, float border_ratio)
 {
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
-	CUfunction kernel = ocl.kernel[KERNEL_CONVOLUTIONY];
+	CUfunction kernel = ocu.kernel[KERNEL_CONVOLUTIONY];
     const void *args[] = { &result, &xsize, &ysize, &inp, &multipliers, &len, &xstep, &offset, &border_ratio };
 
     CUresult err = cuLaunchKernel(kernel,
         BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
         BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -277,18 +277,18 @@ void cuSquareSampleEx(
     const cu_mem image, size_t xsize, size_t ysize,
     size_t xstep, size_t ystep)
 {
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
-	CUfunction kernel = ocl.kernel[KERNEL_SQUARESAMPLE];
+	CUfunction kernel = ocu.kernel[KERNEL_SQUARESAMPLE];
     const void *args[] = { &result, &xsize, &ysize, &image, &xstep, &ystep };
 
     CUresult err = cuLaunchKernel(kernel,
         BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
         BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -308,26 +308,26 @@ void cuBlurEx(cu_mem image/*out, opt*/, const size_t xsize, const size_t ysize,
 
     const int xstep = std::max<int>(1, int(sigma / 3));
 
-    ocu_args_d_t &ocl = getOcu();
-    cu_mem mem_expn = ocl.allocMem(sizeof(cl_float) * expn_size, expn.data());
+    ocu_args_d_t &ocu = getOcu();
+    cu_mem mem_expn = ocu.allocMem(sizeof(cl_float) * expn_size, expn.data());
 
     if (xstep > 1)
     {
-        cu_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize);
+        cu_mem m = ocu.allocMem(sizeof(cl_float) * xsize * ysize);
         cuConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
         cuConvolutionYEx(result ? result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
         cuSquareSampleEx(result ? result : image, result ? result : image, xsize, ysize, xstep, xstep);
-        cuMemFree(m);
+        ocu.releaseMem(m);
     }
     else
     {
-        cu_mem m = ocl.allocMem(sizeof(cl_float) * xsize * ysize);
+        cu_mem m = ocu.allocMem(sizeof(cl_float) * xsize * ysize);
         cuConvolutionXEx(m, image, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
         cuConvolutionYEx(result ? result : image, m, xsize, ysize, mem_expn, expn_size, xstep, diff, border_ratio);
-        cuMemFree(m);
+        ocu.releaseMem(m);
     }
 
-    cuMemFree(mem_expn);
+    ocu.releaseMem(mem_expn);
 }
 
 void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t ysize)
@@ -336,8 +336,8 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t
 
     size_t channel_size = xsize * ysize * sizeof(float);
 
-    ocu_args_d_t &ocl = getOcu();
-    ocu_channels rgb_blurred = ocl.allocMemChannels(channel_size);
+    ocu_args_d_t &ocu = getOcu();
+    ocu_channels rgb_blurred = ocu.allocMemChannels(channel_size);
 
     const int size = xsize * ysize;
 
@@ -345,7 +345,7 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t
     cuBlurEx(rgb.g, xsize, ysize, kSigma, 0.0, rgb_blurred.g);
     cuBlurEx(rgb.b, xsize, ysize, kSigma, 0.0, rgb_blurred.b);
 
-	CUfunction kernel = ocl.kernel[KERNEL_OPSINDYNAMICSIMAGE];
+	CUfunction kernel = ocu.kernel[KERNEL_OPSINDYNAMICSIMAGE];
     const void *args[] = { &rgb.r, &rgb.g, &rgb.b, &size, &rgb_blurred.r, &rgb_blurred.g, &rgb_blurred.b };
 
     CUresult err = cuLaunchKernel(kernel,
@@ -354,12 +354,12 @@ void cuOpsinDynamicsImageEx(ocu_channels &rgb, const size_t xsize, const size_t
         (size + 511) / 512, 1, 1,
         512, 1, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 
-    ocl.releaseMemChannels(rgb_blurred);
+    ocu.releaseMemChannels(rgb_blurred);
 }
 
 void cuMaskHighIntensityChangeEx(
@@ -369,20 +369,20 @@ void cuMaskHighIntensityChangeEx(
 {
     size_t channel_size = xsize * ysize * sizeof(float);
 
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
-    ocu_channels c0 = ocl.allocMemChannels(channel_size);
-    ocu_channels c1 = ocl.allocMemChannels(channel_size);
+    ocu_channels c0 = ocu.allocMemChannels(channel_size);
+    ocu_channels c1 = ocu.allocMemChannels(channel_size);
 
-    cuMemcpyDtoDAsync(c0.r, xyb0.r, channel_size, ocl.commandQueue);
-    cuMemcpyDtoDAsync(c0.g, xyb0.g, channel_size, ocl.commandQueue);
-    cuMemcpyDtoDAsync(c0.b, xyb0.b, channel_size, ocl.commandQueue);
-    cuMemcpyDtoDAsync(c1.r, xyb1.r, channel_size, ocl.commandQueue);
-    cuMemcpyDtoDAsync(c1.g, xyb1.g, channel_size, ocl.commandQueue);
-    cuMemcpyDtoDAsync(c1.b, xyb1.b, channel_size, ocl.commandQueue);
-	cuFinish(ocl.commandQueue);
+    cuMemcpyDtoDAsync(c0.r, xyb0.r, channel_size, ocu.commandQueue);
+    cuMemcpyDtoDAsync(c0.g, xyb0.g, channel_size, ocu.commandQueue);
+    cuMemcpyDtoDAsync(c0.b, xyb0.b, channel_size, ocu.commandQueue);
+    cuMemcpyDtoDAsync(c1.r, xyb1.r, channel_size, ocu.commandQueue);
+    cuMemcpyDtoDAsync(c1.g, xyb1.g, channel_size, ocu.commandQueue);
+    cuMemcpyDtoDAsync(c1.b, xyb1.b, channel_size, ocu.commandQueue);
+	cuFinish(ocu.commandQueue);
 
-	CUfunction kernel = ocl.kernel[KERNEL_MASKHIGHINTENSITYCHANGE];
+	CUfunction kernel = ocu.kernel[KERNEL_MASKHIGHINTENSITYCHANGE];
     const void *args[] = { 
 		&xyb0.r, &xyb0.g, &xyb0.b,
         &xsize, &ysize,
@@ -394,13 +394,13 @@ void cuMaskHighIntensityChangeEx(
         BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
         BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 
-    ocl.releaseMemChannels(c0);
-    ocl.releaseMemChannels(c1);
+    ocu.releaseMemChannels(c0);
+    ocu.releaseMemChannels(c1);
 }
 
 void cuEdgeDetectorMapEx(
@@ -410,10 +410,10 @@ void cuEdgeDetectorMapEx(
 {
     size_t channel_size = xsize * ysize * sizeof(float);
 
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
-    ocu_channels rgb_blured = ocl.allocMemChannels(channel_size);
-    ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size);
+    ocu_channels rgb_blured = ocu.allocMemChannels(channel_size);
+    ocu_channels rgb2_blured = ocu.allocMemChannels(channel_size);
 
     static const double kSigma[3] = { 1.5, 0.586, 0.4 };
 
@@ -426,7 +426,7 @@ void cuEdgeDetectorMapEx(
     const size_t res_xsize = (xsize + step - 1) / step;
     const size_t res_ysize = (ysize + step - 1) / step;
 
-	CUfunction kernel = ocl.kernel[KERNEL_EDGEDETECTOR];
+	CUfunction kernel = ocu.kernel[KERNEL_EDGEDETECTOR];
     const void *args[] = { &result,
         &res_xsize, &res_ysize,
         &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
@@ -437,13 +437,13 @@ void cuEdgeDetectorMapEx(
         BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1,
         BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 
-    ocl.releaseMemChannels(rgb_blured);
-    ocl.releaseMemChannels(rgb2_blured);
+    ocu.releaseMemChannels(rgb_blured);
+    ocu.releaseMemChannels(rgb2_blured);
 }
 
 void cuBlockDiffMapEx(
@@ -452,12 +452,12 @@ void cuBlockDiffMapEx(
     const ocu_channels &rgb, const ocu_channels &rgb2,
     const size_t xsize, const size_t ysize, const size_t step)
 {
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
     const size_t res_xsize = (xsize + step - 1) / step;
     const size_t res_ysize = (ysize + step - 1) / step;
 
-	CUfunction kernel = ocl.kernel[KERNEL_BLOCKDIFFMAP];
+	CUfunction kernel = ocu.kernel[KERNEL_BLOCKDIFFMAP];
     const void *args[] = { &block_diff_dc, &block_diff_ac,
         &res_xsize, &res_ysize,
         &rgb.r, &rgb.g, &rgb.b,
@@ -468,9 +468,9 @@ void cuBlockDiffMapEx(
         BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1,
         BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -483,9 +483,9 @@ void cuEdgeDetectorLowFreqEx(
 
     static const double kSigma = 14;
 
-    ocu_args_d_t &ocl = getOcu();
-    ocu_channels rgb_blured = ocl.allocMemChannels(channel_size);
-    ocu_channels rgb2_blured = ocl.allocMemChannels(channel_size);
+    ocu_args_d_t &ocu = getOcu();
+    ocu_channels rgb_blured = ocu.allocMemChannels(channel_size);
+    ocu_channels rgb2_blured = ocu.allocMemChannels(channel_size);
 
     for (int i = 0; i < 3; i++)
     {
@@ -496,7 +496,7 @@ void cuEdgeDetectorLowFreqEx(
     const size_t res_xsize = (xsize + step - 1) / step;
     const size_t res_ysize = (ysize + step - 1) / step;
 
-	CUfunction kernel = ocl.kernel[KERNEL_EDGEDETECTORLOWFREQ];
+	CUfunction kernel = ocu.kernel[KERNEL_EDGEDETECTORLOWFREQ];
     const void *args[] = { &block_diff_ac,
         &res_xsize, &res_ysize,
         &rgb_blured.r, &rgb_blured.g, &rgb_blured.b,
@@ -508,13 +508,13 @@ void cuEdgeDetectorLowFreqEx(
         BLOCK_COUNT_X(res_xsize), BLOCK_COUNT_Y(res_ysize), 1,
         BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 
-    ocl.releaseMemChannels(rgb_blured);
-    ocl.releaseMemChannels(rgb2_blured);
+    ocu.releaseMemChannels(rgb_blured);
+    ocu.releaseMemChannels(rgb2_blured);
 }
 
 void cuDiffPrecomputeEx(
@@ -522,9 +522,9 @@ void cuDiffPrecomputeEx(
     const ocu_channels &xyb0, const ocu_channels &xyb1,
     const size_t xsize, const size_t ysize)
 {
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
-	CUfunction kernel = ocl.kernel[KERNEL_DIFFPRECOMPUTE];
+	CUfunction kernel = ocu.kernel[KERNEL_DIFFPRECOMPUTE];
     const void *args[] = { &mask.x, &mask.y, &mask.b,
         &xsize, &ysize,
         &xyb0.x, &xyb0.y, &xyb0.b,
@@ -534,18 +534,18 @@ void cuDiffPrecomputeEx(
         BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
         BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
 void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w)
 {
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
     float fw = w;
 
-	CUfunction kernel = ocl.kernel[KERNEL_SCALEIMAGE];
+	CUfunction kernel = ocu.kernel[KERNEL_SCALEIMAGE];
     const void *args[] = { &img, &size, &fw };
 
     CUresult err = cuLaunchKernel(kernel,
@@ -554,9 +554,9 @@ void cuScaleImageEx(cu_mem img/*in, out*/, size_t size, double w)
 //        BLOCK_SIZE_X * BLOCK_SIZE_Y, 1, 1,
         512, 1, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -567,26 +567,26 @@ void cuAverage5x5Ex(cu_mem img/*in,out*/, const size_t xsize, const size_t ysize
         return;
     }
 
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
     size_t len = xsize * ysize * sizeof(float);
-    cu_mem img_org = ocl.allocMem(len);
+    cu_mem img_org = ocu.allocMem(len);
 
     cuMemcpyDtoD(img_org, img, len);
 
-	CUfunction kernel = ocl.kernel[KERNEL_AVERAGE5X5];
+	CUfunction kernel = ocu.kernel[KERNEL_AVERAGE5X5];
     const void *args[] = { &img, &xsize, &ysize, &img_org };
 
     CUresult err = cuLaunchKernel(kernel,
         BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
         BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 
-    cuMemFree(img_org);
+    ocu.releaseMem(img_org);
 }
 
 void cuMinSquareValEx(
@@ -594,23 +594,23 @@ void cuMinSquareValEx(
     const size_t xsize, const size_t ysize,
     const size_t square_size, const size_t offset)
 {
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
-    cu_mem result = ocl.allocMem(sizeof(float) * xsize * ysize);
+    cu_mem result = ocu.allocMem(sizeof(float) * xsize * ysize);
 
-	CUfunction kernel = ocl.kernel[KERNEL_MINSQUAREVAL];
+	CUfunction kernel = ocu.kernel[KERNEL_MINSQUAREVAL];
     const void *args[] = { &result, &xsize, &ysize, &img, &square_size, &offset };
 
     CUresult err = cuLaunchKernel(kernel,
         BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
         BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
     cuMemcpyDtoD(img, result, sizeof(float) * xsize * ysize);
-    cuMemFree(result);
+    ocu.releaseMem(result);
 }
 
 static void MakeMask(double extmul, double extoff,
@@ -629,7 +629,7 @@ static const double kGlobalScale = 1.0 / kInternalGoodQualityThreshold;
 
 void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, size_t xsize, size_t ysize)
 {
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
     double extmul = 0.975741017749;
     double extoff = -4.25328244168;
@@ -710,10 +710,10 @@ void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, siz
     }
 
     size_t channel_size = 512 * sizeof(double);
-    ocu_channels xyb = ocl.allocMemChannels(channel_size, lut_x, lut_y, lut_b);
-    ocu_channels xyb_dc = ocl.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb);
+    ocu_channels xyb = ocu.allocMemChannels(channel_size, lut_x, lut_y, lut_b);
+    ocu_channels xyb_dc = ocu.allocMemChannels(channel_size, lut_dcx, lut_dcy, lut_dcb);
 
-	CUfunction kernel = ocl.kernel[KERNEL_DOMASK];
+	CUfunction kernel = ocu.kernel[KERNEL_DOMASK];
     const void *args[] = { &mask.r, &mask.g, &mask.b,
         &xsize, &ysize,
         &mask_dc.r, &mask_dc.g, &mask_dc.b,
@@ -724,13 +724,13 @@ void cuDoMask(ocu_channels mask/*in, out*/, ocu_channels mask_dc/*in, out*/, siz
         BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
         BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 
-    ocl.releaseMemChannels(xyb);
-    ocl.releaseMemChannels(xyb_dc);
+    ocu.releaseMemChannels(xyb);
+    ocu.releaseMemChannels(xyb_dc);
 }
 
 void cuMaskEx(
@@ -773,12 +773,12 @@ void cuCombineChannelsEx(
     const size_t res_xsize,
     const size_t step)
 {
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
     const size_t work_xsize = ((xsize - 8 + step) + step - 1) / step;
     const size_t work_ysize = ((ysize - 8 + step) + step - 1) / step;
 
-	CUfunction kernel = ocl.kernel[KERNEL_COMBINECHANNELS];
+	CUfunction kernel = ocu.kernel[KERNEL_COMBINECHANNELS];
     const void *args[] = { &result,
         &mask.r, &mask.g, &mask.b,
         &mask_dc.r, &mask_dc.g, &mask_dc.b,
@@ -792,19 +792,19 @@ void cuCombineChannelsEx(
         work_xsize, work_ysize, 1,
         1, 1, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
 void cuUpsampleSquareRootEx(cu_mem diffmap, const size_t xsize, const size_t ysize, const int step)
 {
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
-    cu_mem diffmap_out = ocl.allocMem(xsize * ysize * sizeof(float));
+    cu_mem diffmap_out = ocu.allocMem(xsize * ysize * sizeof(float));
 
-	CUfunction kernel = ocl.kernel[KERNEL_UPSAMPLESQUAREROOT];
+	CUfunction kernel = ocu.kernel[KERNEL_UPSAMPLESQUAREROOT];
     const void *args[] = { &diffmap_out, &diffmap, &xsize, &ysize, &step };
 
     const size_t res_xsize = (xsize + step - 1) / step;
@@ -814,18 +814,18 @@ void cuUpsampleSquareRootEx(cu_mem diffmap, const size_t xsize, const size_t ysi
         res_xsize, res_ysize, 1,
         1, 1, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
     cuMemcpyDtoD(diffmap, diffmap_out, xsize * ysize * sizeof(float));
 
-    cuMemFree(diffmap_out);
+    ocu.releaseMem(diffmap_out);
 }
 
 void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const size_t ysize, const int step)
 {
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
     int cls = 8 - step;
     int cls2 = (8 - step) / 2;
@@ -833,35 +833,35 @@ void cuRemoveBorderEx(cu_mem out, const cu_mem in, const size_t xsize, const siz
     int out_xsize = xsize - cls;
     int out_ysize = ysize - cls;
 
-	CUfunction kernel = ocl.kernel[KERNEL_REMOVEBORDER];
+	CUfunction kernel = ocu.kernel[KERNEL_REMOVEBORDER];
     const void *args[] = { &out, &out_xsize, &out_ysize, &in, &cls, &cls2 };
 
     CUresult err = cuLaunchKernel(kernel,
         BLOCK_COUNT_X(out_xsize), BLOCK_COUNT_Y(out_ysize), 1,
         BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
 void cuAddBorderEx(cu_mem out, size_t xsize, size_t ysize, int step, cu_mem in)
 {
-    ocu_args_d_t &ocl = getOcu();
+    ocu_args_d_t &ocu = getOcu();
 
     int cls = 8 - step;
     int cls2 = (8 - step) / 2;
-	CUfunction kernel = ocl.kernel[KERNEL_ADDBORDER];
+	CUfunction kernel = ocu.kernel[KERNEL_ADDBORDER];
     const void *args[] = { &out, &xsize, &ysize, &cls, &cls2, &in };
 
     CUresult err = cuLaunchKernel(kernel,
         BLOCK_COUNT_X(xsize), BLOCK_COUNT_Y(ysize), 1,
         BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,
         0,
-        ocl.commandQueue, (void**)args, NULL);
+        ocu.commandQueue, (void**)args, NULL);
 	LOG_CU_RESULT(err);
-    err = cuFinish(ocl.commandQueue);
+    err = cuFinish(ocu.commandQueue);
 	LOG_CU_RESULT(err);
 }
 
@@ -876,8 +876,8 @@ void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const si
     const int s = 8 - step;
     int s2 = (8 - step) / 2;
 
-    ocu_args_d_t &ocl = getOcu();
-    cu_mem blurred = ocl.allocMem((xsize - s) * (ysize - s) * sizeof(float));
+    ocu_args_d_t &ocu = getOcu();
+    cu_mem blurred = ocu.allocMem((xsize - s) * (ysize - s) * sizeof(float));
     cuRemoveBorderEx(blurred, diffmap, xsize, ysize, step);
 
     static const double border_ratio = 0.03027655136;
@@ -886,7 +886,7 @@ void cuCalculateDiffmapEx(cu_mem diffmap/*in,out*/, const size_t xsize, const si
     cuAddBorderEx(diffmap, xsize, ysize, step, blurred);
     cuScaleImageEx(diffmap, xsize * ysize, scale);
 
-    cuMemFree(blurred);
+    ocu.releaseMem(blurred);
 }
 
 #ifdef __USE_DOUBLE_AS_FLOAT__
diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index fddf4e20..7ebc0ac1 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -108,6 +108,7 @@ else
     ocu.commandQueue = stream;
     ocu.mod = mod;
     ocu.ctxt = ctxt;
+    ocu.mem_pool.commandQueue = ocu.commandQueue;
 
     return ocu;
 }
@@ -125,23 +126,18 @@ ocu_args_d_t::~ocu_args_d_t()
 {
     cuModuleUnload(mod);
     cuCtxDestroy(ctxt);
+    mem_pool.drain();
 //    cuStreamDestroy(commandQueue);
 }
 
 cu_mem ocu_args_d_t::allocMem(size_t s, const void *init)
 {
-    cu_mem mem;
-    cuMemAlloc(&mem, s);
-    if (init)
-    {
-        cuMemcpyHtoDAsync(mem, init, s, commandQueue);
-    }
-    else
-    {
-        cuMemsetD8Async(mem, 0, s, commandQueue);
-    }
+    return mem_pool.allocMem(s, init);
+}
 
-    return mem;
+void ocu_args_d_t::releaseMem(cu_mem mem)
+{
+    mem_pool.releaseMem(mem);
 }
 
 ocu_channels ocu_args_d_t::allocMemChannels(size_t s, const void *c0, const void *c1, const void *c2)
@@ -161,7 +157,7 @@ void ocu_args_d_t::releaseMemChannels(ocu_channels &rgb)
 {
     for (int i = 0; i < 3; i++)
     {
-        cuMemFree(rgb.ch[i]);
+        releaseMem(rgb.ch[i]);
         rgb.ch[i] = NULL;
     }
 }
diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h
index dbc42916..e8697d2d 100644
--- a/clguetzli/ocu.h
+++ b/clguetzli/ocu.h
@@ -4,6 +4,7 @@
 
 #include <cuda.h>
 #include "ocl.h"
+#include "cumem_pool.h"
 
 #define LOG_CU_RESULT(e)   if (CUDA_SUCCESS != (e)) { LogError("Error: %s:%d returned %s.\n", __FUNCTION__, __LINE__, TranslateCUDAError((e)));}
 
@@ -19,6 +20,7 @@ struct ocu_args_d_t
     ~ocu_args_d_t();
 
     cu_mem allocMem(size_t s, const void *init = NULL);
+    void releaseMem(cu_mem mem);
     ocu_channels allocMemChannels(size_t s, const void *c0 = NULL, const void *c1 = NULL, const void *c2 = NULL);
     void releaseMemChannels(ocu_channels &rgb);
 
@@ -27,6 +29,9 @@ struct ocu_args_d_t
     CUmodule    mod;
     CUcontext   ctxt;
     CUdevice    dev;
+    ocu_mem_pool_t mem_pool;
 };
 
+
+
 #endif
\ No newline at end of file
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 86da4aa7..4fa6af4d 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -144,7 +144,7 @@
       <IntrinsicFunctions>false</IntrinsicFunctions>
       <MinimalRebuild>false</MinimalRebuild>
       <StringPooling>true</StringPooling>
-      <PreprocessorDefinitions>__USE_CUDA__;PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
@@ -162,6 +162,9 @@
     <CustomBuild>
       <Message>CUDA CU</Message>
     </CustomBuild>
+    <Intel_OpenCL_Build_Rules>
+      <Device>3</Device>
+    </Intel_OpenCL_Build_Rules>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <ClCompile>
@@ -188,7 +191,7 @@
       <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;ENABLE_OPENCL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
@@ -206,6 +209,7 @@
     <ClInclude Include="clguetzli\clguetzli.h" />
     <ClInclude Include="clguetzli\clguetzli_test.h" />
     <ClInclude Include="clguetzli\cuguetzli.h" />
+    <ClInclude Include="clguetzli\cumem_pool.h" />
     <ClInclude Include="clguetzli\ocl.h" />
     <ClInclude Include="clguetzli\ocu.h" />
     <ClInclude Include="clguetzli\utils.h" />
@@ -305,6 +309,7 @@
     <ClCompile Include="clguetzli\clguetzli.cpp" />
     <ClCompile Include="clguetzli\clguetzli_test.cpp" />
     <ClCompile Include="clguetzli\cuguetzli.cpp" />
+    <ClCompile Include="clguetzli\cumem_pool.cpp" />
     <ClCompile Include="clguetzli\ocl.cpp" />
     <ClCompile Include="clguetzli\ocu.cpp" />
     <ClCompile Include="clguetzli\utils.cpp" />
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index 38921bde..1cbb6a30 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -318,6 +318,9 @@
     <ClInclude Include="clguetzli\cuguetzli.h">
       <Filter>clguetzli</Filter>
     </ClInclude>
+    <ClInclude Include="clguetzli\cumem_pool.h">
+      <Filter>clguetzli</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc">
@@ -593,6 +596,9 @@
     <ClCompile Include="clguetzli\cuguetzli.cpp">
       <Filter>clguetzli</Filter>
     </ClCompile>
+    <ClCompile Include="clguetzli\cumem_pool.cpp">
+      <Filter>clguetzli</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <None Include="third_party\libpng\pngwin.def">

From e11a712ec5d6ff9dcc0070cb35a282b8c35dbba9 Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Wed, 21 Jun 2017 17:41:12 +0800
Subject: [PATCH 167/189] Add missing files

---
 clguetzli/cumem_pool.cpp | 120 +++++++++++++++++++++++++++++++++++++++
 clguetzli/cumem_pool.h   |  37 ++++++++++++
 2 files changed, 157 insertions(+)
 create mode 100644 clguetzli/cumem_pool.cpp
 create mode 100644 clguetzli/cumem_pool.h

diff --git a/clguetzli/cumem_pool.cpp b/clguetzli/cumem_pool.cpp
new file mode 100644
index 00000000..706ff4ba
--- /dev/null
+++ b/clguetzli/cumem_pool.cpp
@@ -0,0 +1,120 @@
+#include "cumem_pool.h"
+
+#ifdef __USE_CUDA__
+
+bool compare_size(const ocu_mem_block_t& first, const ocu_mem_block_t& second)
+{
+    return (first.size < second.size);
+}
+
+ocu_mem_pool_t::ocu_mem_pool_t()
+    :alloc_count(0)
+{
+
+}
+
+ocu_mem_pool_t::~ocu_mem_pool_t()
+{
+
+}
+
+cu_mem ocu_mem_pool_t::allocMem(size_t s, const void *init)
+{
+    alloc_count++;
+    ocu_mem_block_t *block_candidate = NULL;
+    for (std::list<ocu_mem_block_t>::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++)
+    {
+        ocu_mem_block_t *block = &(*iter);
+        if (block->status == 0 && block->size >= s) {
+            block_candidate = block;
+            break;
+        }
+    }
+    cu_mem mem = NULL;
+    if (block_candidate != NULL) {
+        block_candidate->status = 1;
+        block_candidate->used = s;
+
+        mem = block_candidate->mem;
+        //LogError("mem_pool reuse mem:%lld, used:%lld.\r\n", block_candidate->size, block_candidate->used);
+    }
+    else {
+        cu_mem new_mem;
+        cuMemAlloc(&new_mem, s);
+        ocu_mem_block_t mem_block;
+        mem_block.size = s;
+        mem_block.used = s;
+        mem_block.mem = new_mem;
+        mem_block.status = 1;
+        mem_pool.push_back(mem_block);
+        mem_pool.sort(compare_size);
+
+        mem = new_mem;
+        //LogError("mem_pool new mem:%lld, used:%lld.\r\n", mem_block.size, mem_block.used);
+    }
+    if (init)
+    {
+        cuMemcpyHtoDAsync(mem, init, s, commandQueue);
+    }
+    else
+    {
+        cuMemsetD8Async(mem, 0, s, commandQueue);
+    }
+
+    return mem;
+
+    //cu_mem mem;
+    //cuMemAlloc(&mem, s);
+    //if (init)
+    //{
+    //    cuMemcpyHtoDAsync(mem, init, s, commandQueue);
+    //}
+    //else
+    //{
+    //    cuMemsetD8Async(mem, 0, s, commandQueue);
+    //}
+
+    //return mem;
+}
+
+void ocu_mem_pool_t::releaseMem(cu_mem mem)
+{
+    ocu_mem_block_t *block_candidate = NULL;
+    for (std::list<ocu_mem_block_t>::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++)
+    {
+        ocu_mem_block_t *block = &(*iter);
+        if (block->mem == mem) {
+            block_candidate = block;
+            break;
+        }
+    }
+    if (block_candidate != NULL) {
+        block_candidate->status = 0;
+        block_candidate->used = 0;
+    }
+    else {
+        cuMemFree(mem);
+        LogError("mem_pool release mem:%lld can not be found.\r\n", mem);
+    }
+
+    //LogError("mem_pool release mem:%lld, used:%lld.\r\n", block_candidate->size, block_candidate->used);
+}
+
+void ocu_mem_pool_t::drain()
+{
+    size_t total_mem = 0;
+    size_t total_block = mem_pool.size();
+    ocu_mem_block_t *block_candidate = NULL;
+    for (std::list<ocu_mem_block_t>::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++)
+    {
+        if (iter->status == 0) {
+            total_mem += iter->size;
+            cuMemFree(iter->mem);
+            iter = mem_pool.erase(iter);
+        }
+    }
+
+    LogError("mem_pool has %u blocks, and total memory is:%f kb, total alloc count:%d.\r\n", total_block, (float)(total_mem) / 1024, alloc_count);
+}
+
+#endif
\ No newline at end of file
diff --git a/clguetzli/cumem_pool.h b/clguetzli/cumem_pool.h
new file mode 100644
index 00000000..2abbb69d
--- /dev/null
+++ b/clguetzli/cumem_pool.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#ifdef __USE_CUDA__
+
+#include <list>
+#include <cuda.h>
+#include "ocl.h"
+
+struct ocu_mem_block_t
+{
+    ocu_mem_block_t()
+        :status(0)
+        , used(0)
+    {}
+    ~ocu_mem_block_t()
+    {}
+
+    int status;
+    size_t size;
+    size_t used;
+    cu_mem mem;
+};
+
+struct ocu_mem_pool_t
+{
+    ocu_mem_pool_t();
+    ~ocu_mem_pool_t();
+    cu_mem allocMem(size_t s, const void *init = NULL);
+    void releaseMem(cu_mem mem);
+    void drain();
+
+    std::list<ocu_mem_block_t> mem_pool;
+    CUstream    commandQueue;
+    size_t alloc_count;
+};
+
+#endif
\ No newline at end of file

From 36a3ce62517aad0db5f92fa8c23bb7bea86bd14f Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Wed, 21 Jun 2017 19:26:45 +0800
Subject: [PATCH 168/189] Clean code

---
 clguetzli/cumem_pool.cpp | 53 ++++++++++++++--------------------------
 clguetzli/cumem_pool.h   | 15 ++++++------
 clguetzli/ocu.h          |  2 +-
 3 files changed, 28 insertions(+), 42 deletions(-)

diff --git a/clguetzli/cumem_pool.cpp b/clguetzli/cumem_pool.cpp
index 706ff4ba..50d96eb7 100644
--- a/clguetzli/cumem_pool.cpp
+++ b/clguetzli/cumem_pool.cpp
@@ -2,29 +2,31 @@
 
 #ifdef __USE_CUDA__
 
-bool compare_size(const ocu_mem_block_t& first, const ocu_mem_block_t& second)
+bool compare_size(const cu_mem_block_t& first, const cu_mem_block_t& second)
 {
     return (first.size < second.size);
 }
 
-ocu_mem_pool_t::ocu_mem_pool_t()
-    :alloc_count(0)
+cu_mem_pool_t::cu_mem_pool_t()
+    : alloc_count(0)
+    , total_mem_request(0)
 {
 
 }
 
-ocu_mem_pool_t::~ocu_mem_pool_t()
+cu_mem_pool_t::~cu_mem_pool_t()
 {
 
 }
 
-cu_mem ocu_mem_pool_t::allocMem(size_t s, const void *init)
+cu_mem cu_mem_pool_t::allocMem(size_t s, const void *init)
 {
     alloc_count++;
-    ocu_mem_block_t *block_candidate = NULL;
-    for (std::list<ocu_mem_block_t>::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++)
+    total_mem_request += s;
+    cu_mem_block_t *block_candidate = NULL;
+    for (std::list<cu_mem_block_t>::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++)
     {
-        ocu_mem_block_t *block = &(*iter);
+        cu_mem_block_t *block = &(*iter);
         if (block->status == 0 && block->size >= s) {
             block_candidate = block;
             break;
@@ -36,12 +38,11 @@ cu_mem ocu_mem_pool_t::allocMem(size_t s, const void *init)
         block_candidate->used = s;
 
         mem = block_candidate->mem;
-        //LogError("mem_pool reuse mem:%lld, used:%lld.\r\n", block_candidate->size, block_candidate->used);
     }
     else {
         cu_mem new_mem;
         cuMemAlloc(&new_mem, s);
-        ocu_mem_block_t mem_block;
+        cu_mem_block_t mem_block;
         mem_block.size = s;
         mem_block.used = s;
         mem_block.mem = new_mem;
@@ -50,7 +51,6 @@ cu_mem ocu_mem_pool_t::allocMem(size_t s, const void *init)
         mem_pool.sort(compare_size);
 
         mem = new_mem;
-        //LogError("mem_pool new mem:%lld, used:%lld.\r\n", mem_block.size, mem_block.used);
     }
     if (init)
     {
@@ -62,27 +62,14 @@ cu_mem ocu_mem_pool_t::allocMem(size_t s, const void *init)
     }
 
     return mem;
-
-    //cu_mem mem;
-    //cuMemAlloc(&mem, s);
-    //if (init)
-    //{
-    //    cuMemcpyHtoDAsync(mem, init, s, commandQueue);
-    //}
-    //else
-    //{
-    //    cuMemsetD8Async(mem, 0, s, commandQueue);
-    //}
-
-    //return mem;
 }
 
-void ocu_mem_pool_t::releaseMem(cu_mem mem)
+void cu_mem_pool_t::releaseMem(cu_mem mem)
 {
-    ocu_mem_block_t *block_candidate = NULL;
-    for (std::list<ocu_mem_block_t>::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++)
+    cu_mem_block_t *block_candidate = NULL;
+    for (std::list<cu_mem_block_t>::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++)
     {
-        ocu_mem_block_t *block = &(*iter);
+        cu_mem_block_t *block = &(*iter);
         if (block->mem == mem) {
             block_candidate = block;
             break;
@@ -96,16 +83,14 @@ void ocu_mem_pool_t::releaseMem(cu_mem mem)
         cuMemFree(mem);
         LogError("mem_pool release mem:%lld can not be found.\r\n", mem);
     }
-
-    //LogError("mem_pool release mem:%lld, used:%lld.\r\n", block_candidate->size, block_candidate->used);
 }
 
-void ocu_mem_pool_t::drain()
+void cu_mem_pool_t::drain()
 {
     size_t total_mem = 0;
     size_t total_block = mem_pool.size();
-    ocu_mem_block_t *block_candidate = NULL;
-    for (std::list<ocu_mem_block_t>::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++)
+    cu_mem_block_t *block_candidate = NULL;
+    for (std::list<cu_mem_block_t>::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++)
     {
         if (iter->status == 0) {
             total_mem += iter->size;
@@ -114,7 +99,7 @@ void ocu_mem_pool_t::drain()
         }
     }
 
-    LogError("mem_pool has %u blocks, and total memory is:%f kb, total alloc count:%d.\r\n", total_block, (float)(total_mem) / 1024, alloc_count);
+    LogError("mem_pool has %u blocks, and total pool memory is:%f kb, total memory request:%f kb, total alloc count:%d.\r\n", total_block, (float)(total_mem) / 1024, (float)(total_mem_request) / 1024, alloc_count);
 }
 
 #endif
\ No newline at end of file
diff --git a/clguetzli/cumem_pool.h b/clguetzli/cumem_pool.h
index 2abbb69d..73355e82 100644
--- a/clguetzli/cumem_pool.h
+++ b/clguetzli/cumem_pool.h
@@ -6,13 +6,13 @@
 #include <cuda.h>
 #include "ocl.h"
 
-struct ocu_mem_block_t
+struct cu_mem_block_t
 {
-    ocu_mem_block_t()
+    cu_mem_block_t()
         :status(0)
         , used(0)
     {}
-    ~ocu_mem_block_t()
+    ~cu_mem_block_t()
     {}
 
     int status;
@@ -21,17 +21,18 @@ struct ocu_mem_block_t
     cu_mem mem;
 };
 
-struct ocu_mem_pool_t
+struct cu_mem_pool_t
 {
-    ocu_mem_pool_t();
-    ~ocu_mem_pool_t();
+    cu_mem_pool_t();
+    ~cu_mem_pool_t();
     cu_mem allocMem(size_t s, const void *init = NULL);
     void releaseMem(cu_mem mem);
     void drain();
 
-    std::list<ocu_mem_block_t> mem_pool;
+    std::list<cu_mem_block_t> mem_pool;
     CUstream    commandQueue;
     size_t alloc_count;
+    size_t total_mem_request;
 };
 
 #endif
\ No newline at end of file
diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h
index e8697d2d..1c13e86e 100644
--- a/clguetzli/ocu.h
+++ b/clguetzli/ocu.h
@@ -29,7 +29,7 @@ struct ocu_args_d_t
     CUmodule    mod;
     CUcontext   ctxt;
     CUdevice    dev;
-    ocu_mem_pool_t mem_pool;
+    cu_mem_pool_t mem_pool;
 };
 
 

From e42fdaba25efaf867a1b8bd5c7cc7620ca41b815 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Wed, 21 Jun 2017 17:43:28 +0800
Subject: [PATCH 169/189] Modify makefile

---
 guetzli.make        | 12 ++++++++----
 guetzli_static.make |  4 ++++
 premake5.lua        |  4 ++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/guetzli.make b/guetzli.make
index 3675ba0d..b40f6f4b 100644
--- a/guetzli.make
+++ b/guetzli.make
@@ -15,14 +15,14 @@ ifeq ($(config),release)
   TARGETDIR = bin/Release
   TARGET = $(TARGETDIR)/guetzli
   OBJDIR = obj/Release/guetzli
-  DEFINES += -D__USE_CUDA__
+  DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__ -D__USE_GPERFTOOLS__
   INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)
   ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -O3 -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES)
-  LIBS += -lOpenCL -lcuda
+  LIBS += -lOpenCL -lcuda -lprofiler -lunwind
   LDDEPS +=
   ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags`
   LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS)
@@ -42,14 +42,14 @@ ifeq ($(config),debug)
   TARGETDIR = bin/Debug
   TARGET = $(TARGETDIR)/guetzli
   OBJDIR = obj/Debug/guetzli
-  DEFINES += -D__USE_CUDA__
+  DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__ -D__USE_GPERFTOOLS__
   INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)
   ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES)
-  LIBS += -lOpenCL -lcuda
+  LIBS += -lOpenCL -lcuda -lprofiler -lunwind
   LDDEPS +=
   ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags`
   LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS)
@@ -70,6 +70,7 @@ OBJECTS := \
 	$(OBJDIR)/clguetzli.o \
 	$(OBJDIR)/clguetzli_test.o \
 	$(OBJDIR)/cuguetzli.o \
+	$(OBJDIR)/cumem_pool.o \
 	$(OBJDIR)/ocl.o \
 	$(OBJDIR)/ocu.o \
 	$(OBJDIR)/utils.o \
@@ -166,6 +167,9 @@ $(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp
 $(OBJDIR)/cuguetzli.o: clguetzli/cuguetzli.cpp
 	@echo $(notdir $<)
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/cumem_pool.o: clguetzli/cumem_pool.cpp
+	@echo $(notdir $<)
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/ocl.o: clguetzli/ocl.cpp
 	@echo $(notdir $<)
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
diff --git a/guetzli_static.make b/guetzli_static.make
index 68808523..2d648c04 100644
--- a/guetzli_static.make
+++ b/guetzli_static.make
@@ -70,6 +70,7 @@ OBJECTS := \
 	$(OBJDIR)/clguetzli.o \
 	$(OBJDIR)/clguetzli_test.o \
 	$(OBJDIR)/cuguetzli.o \
+	$(OBJDIR)/cumem_pool.o \
 	$(OBJDIR)/ocl.o \
 	$(OBJDIR)/ocu.o \
 	$(OBJDIR)/utils.o \
@@ -165,6 +166,9 @@ $(OBJDIR)/clguetzli_test.o: clguetzli/clguetzli_test.cpp
 $(OBJDIR)/cuguetzli.o: clguetzli/cuguetzli.cpp
 	@echo $(notdir $<)
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
+$(OBJDIR)/cumem_pool.o: clguetzli/cumem_pool.cpp
+	@echo $(notdir $<)
+	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
 $(OBJDIR)/ocl.o: clguetzli/ocl.cpp
 	@echo $(notdir $<)
 	$(SILENT) $(CXX) $(ALL_CXXFLAGS) $(FORCE_INCLUDE) -o "$@" -MF "$(@:%.o=%.d)" -c "$<"
diff --git a/premake5.lua b/premake5.lua
index f6723df8..099df66f 100644
--- a/premake5.lua
+++ b/premake5.lua
@@ -42,10 +42,10 @@ workspace "guetzli"
   project "guetzli"
     kind "ConsoleApp"
     filter "action:gmake"
-	  defines { "__USE_CUDA__" }
+	  defines { "__USE_OPENCL__", "__USE_CUDA__", "__USE_GPERFTOOLS__" }
       linkoptions { "`pkg-config --libs libpng || libpng-config --ldflags`" }
       buildoptions { "`pkg-config --cflags libpng || libpng-config --cflags`" }
-      links { "OpenCL", "cuda" }
+      links { "OpenCL", "cuda", "profiler", "unwind" }
     filter "action:vs*"
       links { "shlwapi" }
     filter {}

From 644f5637eda4c55a7177b0e5280d24be4bf588fa Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 23 Jun 2017 15:38:55 +0800
Subject: [PATCH 170/189] =?UTF-8?q?=E9=BB=98=E8=AE=A4=E5=BC=80=E5=90=AFCUD?=
 =?UTF-8?q?A=20OPENCL?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 guetzli.vcxproj | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 4fa6af4d..52fda8ba 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -108,7 +108,7 @@
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <MinimalRebuild>false</MinimalRebuild>
       <StringPooling>true</StringPooling>
-      <PreprocessorDefinitions>_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
@@ -173,7 +173,7 @@
       <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>__USE_CUDA__;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
@@ -191,7 +191,7 @@
       <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>

From 340d914549cda2cf06f1561b52c975f7ccb93ca1 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Fri, 23 Jun 2017 18:13:38 +0800
Subject: [PATCH 171/189] =?UTF-8?q?=E7=A7=BB=E9=99=A4tcmalloc=EF=BC=8C?=
 =?UTF-8?q?=E5=AF=B9=E6=80=A7=E8=83=BD=E6=B2=A1=E4=BB=80=E4=B9=88=E5=BD=B1?=
 =?UTF-8?q?=E5=93=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 guetzli.vcxproj         |  81 -------------
 guetzli.vcxproj.filters | 246 ----------------------------------------
 2 files changed, 327 deletions(-)

diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 52fda8ba..6d1153c0 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -239,55 +239,6 @@
     <ClInclude Include="guetzli\score.h" />
     <ClInclude Include="guetzli\stats.h" />
     <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\addressmap-inl.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-linuxppc.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-macosx.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\basictypes.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\commandlineflags.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\googleinit.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\logging.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_linux-inl.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_posix-inl.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_win32-inl.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\stl_allocator.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\thread_annotations.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\central_freelist.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\common.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\config_for_unittests.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-checker.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-profiler.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_extension.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_hook.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\profiler.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\stacktrace.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\internal_logging.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\malloc_hook-inl.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\packed-cache-inl.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\pagemap.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap_allocator.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\raw_printer.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\sampler.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\span.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stacktrace_win32-inl.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\static_vars.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\symbolize.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\system-alloc.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\tcmalloc.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\thread_cache.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\config.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler_types.h" />
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\port.h" />
     <ClInclude Include="third_party\libpng\png.h" />
     <ClInclude Include="third_party\libpng\pngconf.h" />
     <ClInclude Include="third_party\libpng\pngpriv.h" />
@@ -334,38 +285,6 @@
     <ClCompile Include="guetzli\quantize.cc" />
     <ClCompile Include="guetzli\score.cc" />
     <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\dynamic_annotations.c" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\logging.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\central_freelist.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\common.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\fake_stacktrace_scope.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\internal_logging.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_extension.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_hook.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\page_heap.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\raw_printer.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\sampler.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\span.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stacktrace.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\static_vars.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\symbolize.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\thread_cache.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_modrm_map.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_opcode_map.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\patch_functions.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\port.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher_with_stub.cc" />
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\system-alloc.cc" />
     <ClCompile Include="third_party\libpng\png.c" />
     <ClCompile Include="third_party\libpng\pngerror.c" />
     <ClCompile Include="third_party\libpng\pngget.c" />
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index 1cbb6a30..768a8128 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -19,9 +19,6 @@
     <Filter Include="third_party\zlib">
       <UniqueIdentifier>{cb89c1ac-8399-4814-88f2-4b69576bc9f9}</UniqueIdentifier>
     </Filter>
-    <Filter Include="third_party\tcmalloc_minimal">
-      <UniqueIdentifier>{f2b475de-6219-478e-9e5e-08f07ef25dbc}</UniqueIdentifier>
-    </Filter>
     <Filter Include="clguetzli">
       <UniqueIdentifier>{64847a89-ca39-4556-ba0e-d6875c4d39ca}</UniqueIdentifier>
     </Filter>
@@ -147,153 +144,6 @@
     <ClInclude Include="third_party\zlib\zutil.h">
       <Filter>third_party\zlib</Filter>
     </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\addressmap-inl.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\central_freelist.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\common.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\config_for_unittests.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\internal_logging.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\malloc_hook-inl.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\packed-cache-inl.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\page_heap_allocator.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\pagemap.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\raw_printer.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\sampler.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\span.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\stacktrace_win32-inl.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\static_vars.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\symbolize.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\system-alloc.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\tcmalloc.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\thread_cache.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-linuxppc.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-macosx.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\basictypes.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\commandlineflags.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\googleinit.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\logging.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_linux-inl.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_posix-inl.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_win32-inl.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\stl_allocator.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\thread_annotations.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\config.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler_types.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\windows\port.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-checker.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_extension.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\profiler.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\stacktrace.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\base\atomicops.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\heap-profiler.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\gperftools-gperftools-2.5\src\gperftools\malloc_hook.h">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClInclude>
     <ClInclude Include="clguetzli\utils.h">
       <Filter>clguetzli</Filter>
     </ClInclude>
@@ -476,102 +326,6 @@
     <ClCompile Include="third_party\zlib\zutil.c">
       <Filter>third_party\zlib</Filter>
     </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\central_freelist.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\common.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\fake_stacktrace_scope.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\heap-profile-table.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\internal_logging.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_extension.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\malloc_hook.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\memory_region_map.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\page_heap.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\raw_printer.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\sampler.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\span.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stack_trace_table.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\stacktrace.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\static_vars.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\symbolize.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\thread_cache.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_modrm_map.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\ia32_opcode_map.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\mini_disassembler.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\patch_functions.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\port.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\preamble_patcher_with_stub.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\windows\system-alloc.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\dynamic_annotations.c">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\atomicops-internals-x86.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\logging.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\low_level_alloc.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\spinlock_internal.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\gperftools-gperftools-2.5\src\base\sysinfo.cc">
-      <Filter>third_party\tcmalloc_minimal</Filter>
-    </ClCompile>
     <ClCompile Include="clguetzli\utils.cpp">
       <Filter>clguetzli</Filter>
     </ClCompile>

From 6f2726b12008a9c336fdc33501358676c6dee197 Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Thu, 29 Jun 2017 20:00:33 +0800
Subject: [PATCH 172/189] Change memory block status to enum

---
 clguetzli/cumem_pool.cpp | 10 +++++-----
 clguetzli/cumem_pool.h   | 10 ++++++++--
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/clguetzli/cumem_pool.cpp b/clguetzli/cumem_pool.cpp
index 50d96eb7..4fe4964d 100644
--- a/clguetzli/cumem_pool.cpp
+++ b/clguetzli/cumem_pool.cpp
@@ -27,14 +27,14 @@ cu_mem cu_mem_pool_t::allocMem(size_t s, const void *init)
     for (std::list<cu_mem_block_t>::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++)
     {
         cu_mem_block_t *block = &(*iter);
-        if (block->status == 0 && block->size >= s) {
+        if (block->status == MBS_IDLE && block->size >= s) {
             block_candidate = block;
             break;
         }
     }
     cu_mem mem = NULL;
     if (block_candidate != NULL) {
-        block_candidate->status = 1;
+        block_candidate->status = MBS_BUSY;
         block_candidate->used = s;
 
         mem = block_candidate->mem;
@@ -46,7 +46,7 @@ cu_mem cu_mem_pool_t::allocMem(size_t s, const void *init)
         mem_block.size = s;
         mem_block.used = s;
         mem_block.mem = new_mem;
-        mem_block.status = 1;
+        mem_block.status = MBS_BUSY;
         mem_pool.push_back(mem_block);
         mem_pool.sort(compare_size);
 
@@ -76,7 +76,7 @@ void cu_mem_pool_t::releaseMem(cu_mem mem)
         }
     }
     if (block_candidate != NULL) {
-        block_candidate->status = 0;
+        block_candidate->status = MBS_IDLE;
         block_candidate->used = 0;
     }
     else {
@@ -92,7 +92,7 @@ void cu_mem_pool_t::drain()
     cu_mem_block_t *block_candidate = NULL;
     for (std::list<cu_mem_block_t>::iterator iter = mem_pool.begin(); iter != mem_pool.end(); iter++)
     {
-        if (iter->status == 0) {
+        if (iter->status == MBS_IDLE) {
             total_mem += iter->size;
             cuMemFree(iter->mem);
             iter = mem_pool.erase(iter);
diff --git a/clguetzli/cumem_pool.h b/clguetzli/cumem_pool.h
index 73355e82..262f4106 100644
--- a/clguetzli/cumem_pool.h
+++ b/clguetzli/cumem_pool.h
@@ -6,16 +6,22 @@
 #include <cuda.h>
 #include "ocl.h"
 
+enum mem_block_status
+{
+    MBS_IDLE,
+    MBS_BUSY,
+};
+
 struct cu_mem_block_t
 {
     cu_mem_block_t()
-        :status(0)
+        :status(MBS_IDLE)
         , used(0)
     {}
     ~cu_mem_block_t()
     {}
 
-    int status;
+    mem_block_status status;
     size_t size;
     size_t used;
     cu_mem mem;

From 46367ce2986a7977bcc863e7d76f5e71e46011f6 Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Wed, 5 Jul 2017 16:33:04 +0800
Subject: [PATCH 173/189] Remove tcmalloc

---
 guetzli.vcxproj | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 6d1153c0..fb517ca5 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -138,13 +138,13 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>false</IntrinsicFunctions>
       <MinimalRebuild>false</MinimalRebuild>
       <StringPooling>true</StringPooling>
-      <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
@@ -152,7 +152,8 @@
       <OptimizeReferences>true</OptimizeReferences>
       <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
-      <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
+      <ForceSymbolReferences>
+      </ForceSymbolReferences>
       <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32</AdditionalLibraryDirectories>
     </Link>
     <PostBuildEvent>
@@ -188,20 +189,24 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;PERFTOOLS_DLL_DECL=;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
       <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
-      <ForceSymbolReferences>__tcmalloc</ForceSymbolReferences>
+      <ForceSymbolReferences>
+      </ForceSymbolReferences>
       <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32</AdditionalLibraryDirectories>
     </Link>
     <PostBuildEvent />
+    <Intel_OpenCL_Build_Rules>
+      <Device>3</Device>
+    </Intel_OpenCL_Build_Rules>
   </ItemDefinitionGroup>
   <ItemGroup>
     <ClInclude Include="clguetzli\clbutter_comparator.h" />

From 80319852f00fb8417a649c6dd1889ce5de28d3ba Mon Sep 17 00:00:00 2001
From: zhantong <zhantong1994@163.com>
Date: Fri, 7 Jul 2017 13:57:24 +0800
Subject: [PATCH 174/189] =?UTF-8?q?=E6=94=AF=E6=8C=81=E9=9D=9E=E4=B8=BB?=
 =?UTF-8?q?=E6=B5=81JPEG=E6=A0=BC=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

使用libjpeg库
---
 guetzli.vcxproj              | 12 ++++-----
 guetzli/jpeg_data_decoder.cc |  5 ++--
 guetzli/processor.cc         | 47 ++++++++++++++++++++++++++++++------
 guetzli/processor.h          |  3 +++
 4 files changed, 51 insertions(+), 16 deletions(-)

diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index fb517ca5..ae2e8fc7 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -102,7 +102,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;third_party\libjpeg;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>Full</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
@@ -114,9 +114,9 @@
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
-      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64;third_party\libjpeg\x64</AdditionalLibraryDirectories>
     </Link>
     <CustomBuild>
       <Command>"$(INTELOCLSDKROOT)bin\x64\ioc64.exe" -cmd=build -input="%(FullPath)" -output="x64\Release\%(Filename).out" -VS -device=GPU -simd=default -targetos=current            -bo="           "</Command>
@@ -138,7 +138,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\libjpeg;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>false</IntrinsicFunctions>
@@ -150,11 +150,11 @@
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <ForceSymbolReferences>
       </ForceSymbolReferences>
-      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32;C:\Users\tongzhan\GitHub\guetzli\third_party\libjpeg\x86</AdditionalLibraryDirectories>
     </Link>
     <PostBuildEvent>
       <Command>
diff --git a/guetzli/jpeg_data_decoder.cc b/guetzli/jpeg_data_decoder.cc
index 98f9f4cc..722d6663 100644
--- a/guetzli/jpeg_data_decoder.cc
+++ b/guetzli/jpeg_data_decoder.cc
@@ -43,9 +43,8 @@ bool HasYCbCrColorSpace(const JPEGData& jpg) {
 }
 
 std::vector<uint8_t> DecodeJpegToRGB(const JPEGData& jpg) {
-  if (jpg.components.size() == 1 ||
-      (jpg.components.size() == 3 &&
-       HasYCbCrColorSpace(jpg) && (jpg.Is420() || jpg.Is444()))) {
+  if (jpg.components.size() == 3 &&
+       HasYCbCrColorSpace(jpg) && (jpg.Is420() || jpg.Is444())) {
     OutputImage img(jpg.width, jpg.height);
     img.CopyFromJpegData(jpg);
     return img.ToSRGB();
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index 3d39da02..f0a0bf48 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -33,6 +33,8 @@
 #include "guetzli/quantize.h"
 #include "clguetzli/clguetzli.h"
 
+#include "third_party/libjpeg/jpeglib.h"
+
 namespace guetzli {
 
 namespace {
@@ -1033,10 +1035,7 @@ bool Process(const Params& params, ProcessStats* stats,
   }
   std::vector<uint8_t> rgb = DecodeJpegToRGB(jpg);
   if (rgb.empty()) {
-    fprintf(stderr, "Unsupported input JPEG file (e.g. unsupported "
-            "downsampling mode).\nPlease provide the input image as "
-            "a PNG file.\n");
-    return false;
+    return ProcessUnsupportedJpegData(params,stats,data,jpg_out);
   }
   GuetzliOutput out;
   ProcessStats dummy_stats;
@@ -1050,9 +1049,9 @@ bool Process(const Params& params, ProcessStats* stats,
         new ButteraugliComparatorEx(jpg.width, jpg.height, &rgb,
                                   params.butteraugli_target, stats));
 #else
-	comparator.reset(
-		new ButteraugliComparator(jpg.width, jpg.height, &rgb,
-			params.butteraugli_target, stats));
+   comparator.reset(
+       new ButteraugliComparator(jpg.width, jpg.height, &rgb,
+           params.butteraugli_target, stats));
 #endif
   }
   bool ok = ProcessJpegData(params, jpg, comparator.get(), &out, stats);
@@ -1060,6 +1059,40 @@ bool Process(const Params& params, ProcessStats* stats,
   return ok;
 }
 
+// Fallback for JPEGs the built-in decoder rejects: decode to RGB via libjpeg,
+// then re-encode. NOTE: libjpeg's default error handler still exit()s on fatal errors.
+bool ProcessUnsupportedJpegData(const Params& params, ProcessStats* stats,
+	const std::string& data,
+	std::string* jpg_out) {
+	struct jpeg_decompress_struct cinfo;
+	struct jpeg_error_mgr jerr;
+	cinfo.err = jpeg_std_error(&jerr);
+	jpeg_create_decompress(&cinfo);
+	jpeg_mem_src(&cinfo, (unsigned char*)data.c_str(), data.length());
+
+	if (jpeg_read_header(&cinfo, TRUE) != JPEG_HEADER_OK) {
+		fprintf(stderr, "File does not seem to be a normal JPEG\n");
+		jpeg_destroy_decompress(&cinfo);
+		return false;  // let the caller handle it instead of exit()
+	}
+
+	cinfo.out_color_space = JCS_RGB; //force RGB output
+	jpeg_start_decompress(&cinfo);
+	const int xsize = cinfo.output_width;
+	const int ysize = cinfo.output_height;
+	// size_t math: int multiplication can overflow on large images.
+	const size_t row_stride = static_cast<size_t>(cinfo.output_width) * cinfo.output_components;
+	// Decode straight into the vector; no malloc'd staging buffer to leak.
+	std::vector<uint8_t> temp_rgb(row_stride * cinfo.output_height);
+	while (cinfo.output_scanline < cinfo.output_height) {
+		JSAMPROW row = temp_rgb.data() + cinfo.output_scanline * row_stride;
+		jpeg_read_scanlines(&cinfo, &row, 1);
+	}
+	jpeg_finish_decompress(&cinfo);
+	jpeg_destroy_decompress(&cinfo);
+	return Process(params, stats, temp_rgb, xsize, ysize, jpg_out);
+}
+
 bool Process(const Params& params, ProcessStats* stats,
              const std::vector<uint8_t>& rgb, int w, int h,
              std::string* jpg_out) {
diff --git a/guetzli/processor.h b/guetzli/processor.h
index 924ba0fa..e6cf4ba8 100644
--- a/guetzli/processor.h
+++ b/guetzli/processor.h
@@ -53,6 +53,9 @@ struct GuetzliOutput {
 bool ProcessJpegData(const Params& params, const JPEGData& jpg_in,
                      Comparator* comparator, GuetzliOutput* out,
                      ProcessStats* stats);
+bool ProcessUnsupportedJpegData(const Params& params,
+	ProcessStats* stats, const std::string& data,
+	std::string* jpg_out);
 
 // Sets *out to a jpeg encoded string that will decode to an image that is
 // visually indistinguishable from the input rgb image.

From eda913fedda3754a4c7891654c36fe6dd13ca714 Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Sun, 9 Jul 2017 23:30:16 +0800
Subject: [PATCH 175/189] Modify makefile

---
 guetzli.make | 4 ++--
 premake5.lua | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/guetzli.make b/guetzli.make
index b40f6f4b..a458eb09 100644
--- a/guetzli.make
+++ b/guetzli.make
@@ -22,7 +22,7 @@ ifeq ($(config),release)
   ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -O3 -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES)
-  LIBS += -lOpenCL -lcuda -lprofiler -lunwind
+  LIBS += -lOpenCL -lcuda -lprofiler -lunwind -ljpeg
   LDDEPS +=
   ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags`
   LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS)
@@ -49,7 +49,7 @@ ifeq ($(config),debug)
   ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES)
-  LIBS += -lOpenCL -lcuda -lprofiler -lunwind
+  LIBS += -lOpenCL -lcuda -lprofiler -lunwind -ljpeg
   LDDEPS +=
   ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags`
   LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS)
diff --git a/premake5.lua b/premake5.lua
index 099df66f..1c5ef6c6 100644
--- a/premake5.lua
+++ b/premake5.lua
@@ -31,6 +31,7 @@ workspace "guetzli"
         "guetzli/*.h",
         "third_party/butteraugli/butteraugli/butteraugli.cc",
         "third_party/butteraugli/butteraugli/butteraugli.h",
+        "third_party/libjpeg/*.h",
         "clguetzli/*.cpp",
         "clguetzli/*.h"
       }
@@ -45,7 +46,7 @@ workspace "guetzli"
 	  defines { "__USE_OPENCL__", "__USE_CUDA__", "__USE_GPERFTOOLS__" }
       linkoptions { "`pkg-config --libs libpng || libpng-config --ldflags`" }
       buildoptions { "`pkg-config --cflags libpng || libpng-config --cflags`" }
-      links { "OpenCL", "cuda", "profiler", "unwind" }
+      links { "OpenCL", "cuda", "profiler", "unwind", "jpeg" }
     filter "action:vs*"
       links { "shlwapi" }
     filter {}
@@ -55,6 +56,7 @@ workspace "guetzli"
         "guetzli/*.h",
         "third_party/butteraugli/butteraugli/butteraugli.cc",
         "third_party/butteraugli/butteraugli/butteraugli.h",
+        "third_party/libjpeg/*.h",
         "clguetzli/*.cpp",
         "clguetzli/*.h"
       }

From 4058d6ed5889da49a7e467bd403498ca26d8cccf Mon Sep 17 00:00:00 2001
From: zhantong <zhantong1994@163.com>
Date: Mon, 10 Jul 2017 17:54:01 +0800
Subject: [PATCH 176/189] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dlibjpeg=E5=BA=93?=
 =?UTF-8?q?=E5=9C=A8debug=E5=92=8C32=E4=BD=8D=E4=B8=8B=E7=BC=96=E8=AF=91?=
 =?UTF-8?q?=E4=B8=8D=E6=88=90=E5=8A=9F=E7=9A=84=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 guetzli.vcxproj         | 17 ++++++++++-------
 guetzli.vcxproj.filters | 12 ++++++++++++
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index ae2e8fc7..32cc12c7 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -154,7 +154,7 @@
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <ForceSymbolReferences>
       </ForceSymbolReferences>
-      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32;C:\Users\tongzhan\GitHub\guetzli\third_party\libjpeg\x86</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32;third_party\libjpeg\x86</AdditionalLibraryDirectories>
     </Link>
     <PostBuildEvent>
       <Command>
@@ -171,7 +171,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;third_party\libjpeg;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
@@ -179,9 +179,9 @@
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
-      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64;third_party\libjpeg\x64</AdditionalLibraryDirectories>
     </Link>
     <PostBuildEvent />
   </ItemDefinitionGroup>
@@ -189,7 +189,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\libjpeg;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;%(PreprocessorDefinitions)</PreprocessorDefinitions>
@@ -197,11 +197,11 @@
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <ForceSymbolReferences>
       </ForceSymbolReferences>
-      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32</AdditionalLibraryDirectories>
+      <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\Win32;third_party\libjpeg\x86</AdditionalLibraryDirectories>
     </Link>
     <PostBuildEvent />
     <Intel_OpenCL_Build_Rules>
@@ -244,6 +244,9 @@
     <ClInclude Include="guetzli\score.h" />
     <ClInclude Include="guetzli\stats.h" />
     <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h" />
+    <ClInclude Include="third_party\libjpeg\jconfig.h" />
+    <ClInclude Include="third_party\libjpeg\jmorecfg.h" />
+    <ClInclude Include="third_party\libjpeg\jpeglib.h" />
     <ClInclude Include="third_party\libpng\png.h" />
     <ClInclude Include="third_party\libpng\pngconf.h" />
     <ClInclude Include="third_party\libpng\pngpriv.h" />
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index 768a8128..785c7382 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -22,6 +22,9 @@
     <Filter Include="clguetzli">
       <UniqueIdentifier>{64847a89-ca39-4556-ba0e-d6875c4d39ca}</UniqueIdentifier>
     </Filter>
+    <Filter Include="third_party\libjpeg">
+      <UniqueIdentifier>{1ac67559-7330-41c7-9a6d-10c3abee000e}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="guetzli\butteraugli_comparator.h">
@@ -171,6 +174,15 @@
     <ClInclude Include="clguetzli\cumem_pool.h">
       <Filter>clguetzli</Filter>
     </ClInclude>
+    <ClInclude Include="third_party\libjpeg\jconfig.h">
+      <Filter>third_party\libjpeg</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\libjpeg\jmorecfg.h">
+      <Filter>third_party\libjpeg</Filter>
+    </ClInclude>
+    <ClInclude Include="third_party\libjpeg\jpeglib.h">
+      <Filter>third_party\libjpeg</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc">

From c100839e9975bb0ade5b6cff52a8feaede0198cb Mon Sep 17 00:00:00 2001
From: ianhuang-777 <306168910@qq.com>
Date: Tue, 11 Jul 2017 10:36:23 +0800
Subject: [PATCH 177/189] Translate the comment.

---
 clguetzli/clguetzli.cl       | 53 +++++++++++-------------------------
 clguetzli/clguetzli.cl.cpp   | 29 --------------------
 clguetzli/clguetzli.cu       | 21 --------------
 clguetzli/clguetzli_test.cpp | 10 -------
 clguetzli/cumem_pool.h       |  2 ++
 clguetzli/ocu.cpp            | 30 --------------------
 guetzli/processor.cc         |  5 ++--
 7 files changed, 21 insertions(+), 129 deletions(-)

diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index f0e16db0..b4d11a92 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -13,7 +13,7 @@
 #define kBlockHalf      (kBlockEdge * kBlockEdgeHalf)
 #define kComputeBlockSize (kBlockSize * 3)
 
-// IntFloatPair��Ϊ��ģ��output_order input_order��vector
+// IntFloatPair: opencl version of output_order/input_order
 typedef struct __IntFloatPair
 {
     int   idx;
@@ -734,21 +734,20 @@ __kernel void clAddBorderEx(__global float *out, const int xsize, const int ysiz
 
 }
 
-// batch��ָ�Ѿ���ά��չ��Ϊ��һά��
 __kernel void clComputeBlockZeroingOrderEx(
-    __global const coeff_t *orig_batch_0,       // ԭʼͼ��ϵ��
-    __global const coeff_t *orig_batch_1,       // ԭʼͼ��ϵ��
-    __global const coeff_t *orig_batch_2,       // ԭʼͼ��ϵ��
-    __global const float   *orig_image_batch,   // ԭʼͼ��pregamma
-    __global const float   *mask_scale,         // ԭʼͼ���ĳ�����ز���
+    __global const coeff_t *orig_batch_0,       // Coeffs of Original image.
+    __global const coeff_t *orig_batch_1,       // Coeffs of Original image.
+    __global const coeff_t *orig_batch_2,       // Coeffs of Original image.
+    __global const float   *orig_image_batch,   // pregamma of Original image..
+    __global const float   *mask_scale,         // mask_scale of Original image..
     const int              block_xsize,
     const int              block_ysize,
     const int              image_width,
     const int              image_height,
 
-    __global const coeff_t *mayout_batch_0,     // �����ѡͼ��ϵ��
-    __global const coeff_t *mayout_batch_1,     // �����ѡͼ��ϵ��
-    __global const coeff_t *mayout_batch_2,     // �����ѡͼ��ϵ��
+    __global const coeff_t *mayout_batch_0,     // Coeffs of output image.
+    __global const coeff_t *mayout_batch_1,     // Coeffs of output image.
+    __global const coeff_t *mayout_batch_2,     // Coeffs of output image.
     __global const ushort  *mayout_pixel_0,
     __global const ushort  *mayout_pixel_1,
     __global const ushort  *mayout_pixel_2,
@@ -756,8 +755,8 @@ __kernel void clComputeBlockZeroingOrderEx(
     const channel_info     mayout_channel_0,
     const channel_info     mayout_channel_1,
     const channel_info     mayout_channel_2,
-    const int factor,                                 // ��ǰ���������factor
-    const int comp_mask,                              // ��ǰ���������channel
+    const int factor,                                 // Current factor in computing.
+    const int comp_mask,                              // Current channel in computing.
     const float BlockErrorLimit,
     __global CoeffData *output_order_list/*out*/)
 {
@@ -779,7 +778,7 @@ __kernel void clComputeBlockZeroingOrderEx(
     mayout_channel[1].pixel = mayout_pixel_1;
     mayout_channel[2].pixel = mayout_pixel_2;
 
-    int block_idx = 0;        // ��������mask���е�channel������indx
+    int block_idx = 0;
 
     coeff_t mayout_block[kComputeBlockSize] = { 0 };
     coeff_t orig_block[kComputeBlockSize]   = { 0 };
@@ -833,7 +832,7 @@ __kernel void clComputeBlockZeroingOrderEx(
         }
 
         if (best_err >= BlockErrorLimit)
-        {   // err������������ģ���������Ѿ�����ErrorLimit�������ļ�������������
+        {   // The input_order is an ascent vector, break when best_err exceed the error limit.
             break;
         }
         int idx = input_order.pData[best_i].idx;
@@ -843,7 +842,6 @@ __kernel void clComputeBlockZeroingOrderEx(
         list_push_back(&output_order, idx, best_err);
     }
 
-    // ע��output_order�����resize���ǰ�β������λ0
     float min_err = 1e10;
     for (int i = output_order.size - 1; i >= 0; --i) {
         min_err = min(min_err, output_order.pData[i].err);
@@ -855,7 +853,7 @@ __kernel void clComputeBlockZeroingOrderEx(
     int out_count = 0;
     for (int i = 0; i < kComputeBlockSize && i < output_order.size; i++)
     {
-        // ���˽ϴ��err���ⲿ�ֽ����˼���û������
+        // err exceeding the limit is no need to continue.
         if (output_order.pData[i].err <= BlockErrorLimit)
         {
             output_block[out_count].idx = output_order.pData[i].idx;
@@ -1573,8 +1571,6 @@ __device__ void RgbToXyb(double r, double g, double b, double *valx, double *val
     *valz = b;
 }
 
-// chrisk todo
-// return size
 __device__ int list_push_back(IntFloatPairList* list, int i, float f)
 {
 	list->pData[list->size].idx = i;
@@ -1582,8 +1578,6 @@ __device__ int list_push_back(IntFloatPairList* list, int i, float f)
     return ++list->size;
 }
 
-// chrisk todo
-// remove idx and return size
 __device__ int list_erase(IntFloatPairList* list, int idx)
 {
 	for (int i = idx; i < list->size - 1; i++)
@@ -1594,7 +1588,6 @@ __device__ int list_erase(IntFloatPairList* list, int idx)
     return --list->size;
 }
 
-// chrisk todo
 __device__  int SortInputOrder(DCTScoreData* input_order, int size)
 {
 	int i, j;
@@ -2010,8 +2003,6 @@ __device__ coeff_t _abs(coeff_t val)
 	return val >= 0 ? val : -val;
 }
 
-// chrisk todo
-// return the count of Non-zero item
 __device__ int MakeInputOrder(__global const coeff_t *block, __global const coeff_t *orig_block, IntFloatPairList *input_order, int block_size)
 {
 	int size = 0;
@@ -2763,7 +2754,6 @@ __device__ void YUVToImage(__private uchar yuv[3 * 8 * 8], float* r, float* g, f
 #undef lut
 }
 
-// chrisk todo
 __device__ void BlockToImage(__private const coeff_t block[8*8*3], float r[8*8], float g[8*8], float b[8*8], int inside_x, int inside_y)
 {
 	uchar idct[3][8 * 8];
@@ -2927,11 +2917,8 @@ __device__ void Convolution(size_t xsize, size_t ysize,
 	}
 }
 
-// ian todo
-// �����������output
 __device__ void BlurEx(const float *r, int xsize, int ysize, double kSigma, double border_ratio, float *output)
 {
-    // �ο�clBlurEx2��ʵ�֣�sigma = 1.1����ʱstep��diff�����ػ�Ϊ�̶�ֵ
 	const double sigma = 1.1;
 	double m = 2.25;  // Accuracy increases when m is increased.
 	const double scaler = -0.41322314049586772; // when sigma=1.1, scaler is -0.41322314049586772
@@ -2953,7 +2940,6 @@ __device__ void BlurEx(const float *r, int xsize, int ysize, double kSigma, doub
               border_ratio, output);
 }
 
-// ian todo
 __device__ void OpsinDynamicsImageBlock(__private float *r, __private float *g, __private float *b,
                             __private const float *r_blurred, __private const float *g_blurred, __private const float *b_blurred,
                             int size)
@@ -2983,7 +2969,6 @@ __device__ void OpsinDynamicsImageBlock(__private float *r, __private float *g,
   }
 }
 
-// chrisk todo
 __device__ void MaskHighIntensityChangeBlock(float *xyb0_x, float *xyb0_y, float *xyb0_b,
     float *xyb1_x, float *xyb1_y, float *xyb1_b,
     const float *c0_x, const float *c0_y, const float *c0_b,
@@ -3079,10 +3064,7 @@ __device__ void CalcOpsinDynamicsImage(__private float rgb[3][kDCTBlockSize])
 
 __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize], __private float rgb1_c[3][kDCTBlockSize], const __global float* mask_scale_block)
 { 
-//    return 0;       // 126ms 
-//    CalcOpsinDynamicsImage(rgb0_c);  -- calc in cpu one time
     CalcOpsinDynamicsImage(rgb1_c);     
-//    return 0;       // 425ms
 
     float rgb0[3][kDCTBlockSize];
     float rgb1[3][kDCTBlockSize];
@@ -3095,9 +3077,8 @@ __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize],
                                 rgb0_c[0], rgb0_c[1], rgb0_c[2],
                                 rgb1_c[0], rgb1_c[1], rgb1_c[2],
                                 8, 8);
-//    return 0;       // 544ms
-    // ����ΪɶҪ��floatת��double���ܼ��������㣿
-    double b0[3 * kDCTBlockSize];       // 
+
+    double b0[3 * kDCTBlockSize];       
     double b1[3 * kDCTBlockSize];
     for (int c = 0; c < 3; ++c) {
         for (int ix = 0; ix < kDCTBlockSize; ++ix) {
@@ -3111,7 +3092,6 @@ __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize],
     double diff_xyz_edge_dc[3] = { 0.0 };
     ButteraugliBlockDiff(b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc);
 
-//    return 0;       // 735ms
     double diff = 0.0;
     double diff_edge = 0.0;
 
@@ -3123,7 +3103,6 @@ __device__ double ComputeImage8x8Block(__private float rgb0_c[3][kDCTBlockSize],
     const double kEdgeWeight = 0.05;
     return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge);
 
-//   750ms
 }
 
 // return the count of Non-zero item
diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp
index f29a283c..45533f60 100644
--- a/clguetzli/clguetzli.cl.cpp
+++ b/clguetzli/clguetzli.cl.cpp
@@ -212,35 +212,6 @@ namespace guetzli
     double ButteraugliComparatorEx::CompareBlock(const OutputImage& img, int off_x, int off_y, const coeff_t* candidate_block, const int comp_mask) const
     {
         double err = ButteraugliComparator::CompareBlock(img, off_x, off_y, candidate_block, comp_mask);
-/*
-        if (g_checkOpenCL)
-        {
-            channel_info mayout_channel[3];
-            for (int c = 0; c < 3; c++)
-            {
-                mayout_channel[c].block_height = img.component(c).height_in_blocks();
-                mayout_channel[c].block_width = img.component(c).width_in_blocks();
-                mayout_channel[c].factor = img.component(c).factor_x();
-                mayout_channel[c].pixel = img.component(c).pixels();
-                mayout_channel[c].coeff = img.component(c).coeffs();
-            }
-
-            double err2 = CompareBlockFactor(mayout_channel,
-                candidate_block,
-                block_x_,
-                block_y_,
-                imgOpsinDynamicsBlockList.data(),
-                imgMaskXyzScaleBlockList.data(),
-                width_,
-                height_,
-                factor_x_);
-
-            if (fabs(err - err2) > 0.001)
-            {
-                LogError("CompareBlock miss %s(%d) \r\n", __FUNCTION__, __LINE__);
-            }
-        }
-*/
         return err;
     }
 }
diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu
index 351bed47..974be98e 100644
--- a/clguetzli/clguetzli.cu
+++ b/clguetzli/clguetzli.cu
@@ -1,22 +1 @@
 #include "clguetzli/clguetzli.cl"
-/*
-__device__ int get_global_id(int dim)
-{
-    switch (dim)
-    {
-    case 0:
-        return threadIdx.x;
-    case 1:
-        return threadIdx.y;
-    case 2:
-        return threadIdx.z;
-    default:
-        return threadIdx.x;
-    }
-}
-
-__device__ int get_global_size(int dim)
-{
-    return 0;
-}
-*/
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index b5fa50c5..967a6652 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -69,7 +69,6 @@ void tclMaskHighIntensityChange(const float* r, const float* g, const float* b,
 	ocl.releaseMemChannels(xyb1);
 }
 
-// strong to
 void tclEdgeDetectorMap(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize, size_t step,
@@ -101,7 +100,6 @@ void tclEdgeDetectorMap(const float* r, const float* g, const float* b,
 	clReleaseMemObject(edge);
 }
 
-// strong todo
 void tclBlockDiffMap(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize, size_t step,
@@ -140,7 +138,6 @@ void tclBlockDiffMap(const float* r, const float* g, const float* b,
 	clReleaseMemObject(block_diff_dc);
 }
 
-// strong to
 void tclEdgeDetectorLowFreq(const float* r, const float* g, const float* b,
 	const float* r2, const float* g2, const float* b2,
 	size_t xsize, size_t ysize, size_t step,
@@ -258,7 +255,6 @@ void tclCombineChannels(const float *mask_xyb_x, const float *mask_xyb_y, const
 	clReleaseMemObject(cl_result);
 }
 
-// ian todo
 void tclCalculateDiffmap(const size_t xsize, const size_t ysize,
 	const size_t step,
 	const float *diffmap, size_t org_len,
@@ -278,7 +274,6 @@ void tclCalculateDiffmap(const size_t xsize, const size_t ysize,
 	clReleaseMemObject(mem_diffmap);
 }
 
-// chrisk todo
 void tclBlur(const float* channel, size_t xsize, size_t ysize, double sigma, double border_ratio, const float* result)
 {
     size_t channel_size = xsize * ysize * sizeof(float);
@@ -299,7 +294,6 @@ void tclBlur(const float* channel, size_t xsize, size_t ysize, double sigma, dou
     clReleaseMemObject(r);
 }
 
-// chrisk todo
 void tclConvolution(size_t xsize, size_t ysize,
 	size_t xstep,
 	size_t len, size_t offset,
@@ -333,7 +327,6 @@ void tclConvolution(size_t xsize, size_t ysize,
 	clReleaseMemObject(m);
 }
 
-// ian todo
 void tclDiffPrecompute(
   const std::vector<std::vector<float> > &xyb0,
   const std::vector<std::vector<float> > &xyb1,
@@ -366,7 +359,6 @@ void tclDiffPrecompute(
   ocl.releaseMemChannels(cl_mask);
 }
 
-// ian todo
 void tclAverage5x5(int xsize, int ysize, const std::vector<float> &diffs_org, const std::vector<float> &diffs_cmp)
 {
   cl_int err = 0;
@@ -382,7 +374,6 @@ void tclAverage5x5(int xsize, int ysize, const std::vector<float> &diffs_org, co
   clReleaseMemObject(mem_diff);
 }
 
-// chrisk todo
 void tclMinSquareVal(const float *img, size_t square_size, size_t offset,
 	size_t xsize, size_t ysize,
 	const float *result)
@@ -422,7 +413,6 @@ void tclScaleImage(double scale, const float *result_org, const float *result_cm
     clReleaseMemObject(mem_result_org);
 }
 
-// strong todo
 void tclOpsinDynamicsImage(const float* r, const float* g, const float* b, size_t xsize, size_t ysize,
 	const float* result_r, const float* result_g, const float* result_b)
 {
diff --git a/clguetzli/cumem_pool.h b/clguetzli/cumem_pool.h
index 262f4106..d2ceec04 100644
--- a/clguetzli/cumem_pool.h
+++ b/clguetzli/cumem_pool.h
@@ -6,6 +6,8 @@
 #include <cuda.h>
 #include "ocl.h"
 
+/*Simple memory pool for CUDA, aiming to reduce the memory allocation count, because it's time consuming.*/
+
 enum mem_block_status
 {
     MBS_IDLE,
diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index 7ebc0ac1..2afe793d 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -32,35 +32,6 @@ ocu_args_d_t& getOcu(void)
     cuDeviceGetAttribute(&proc_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
     cuDeviceGetAttribute(&thread_count, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
     LogError("CUDA Adapter:%s Ver%d.%d MP %d MaxThread Per MP %d)\r\n", name, cap_major, cap_minor, proc_count, thread_count);
-/*
-    char* source = nullptr;
-    size_t src_size = 0;
-    ReadSourceFromFile("clguetzli/clguetzli.cl", &source, &src_size);
-
-    nvrtcProgram prog;
-    const char *opts[] = { "-arch=compute_30", "-default-device", "-G", "-I\"./\"", "--fmad=false" };
-    nvrtcCreateProgram(&prog, source, "clguetzli.cl", 0, NULL, NULL);
-    nvrtcResult compile_result;// = nvrtcCompileProgram(prog, 3, opts);
-    if (NVRTC_SUCCESS != compile_result)
-    {
-        // Obtain compilation log from the program.
-        size_t logSize = 0;
-        nvrtcGetProgramLogSize(prog, &logSize);
-        char *log = new char[logSize];
-        nvrtcGetProgramLog(prog, log);
-
-        LogError("BuildInfo:\r\n%s\r\n", log);
-
-        delete[] log;
-    }
-
-    delete[] source;
-    // Obtain PTX from the program.
-    size_t ptxSize = 0;
-    nvrtcGetPTXSize(prog, &ptxSize);
-    char *ptx = new char[ptxSize];
-    nvrtcGetPTX(prog, ptx);
-*/
 
     char* ptx = nullptr;
     size_t src_size = 0;
@@ -127,7 +98,6 @@ ocu_args_d_t::~ocu_args_d_t()
     cuModuleUnload(mod);
     cuCtxDestroy(ctxt);
     mem_pool.drain();
-//    cuStreamDestroy(commandQueue);
 }
 
 cu_mem ocu_args_d_t::allocMem(size_t s, const void *init)
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index f0a0bf48..ffbf6f24 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -455,7 +455,8 @@ void Processor::ComputeBlockZeroingOrder(
 	  if (MODE_CPU_OPT == g_mathMode)
 	  {
 		  if (best_err >= comparator_->BlockErrorLimit())
-		  {   // err������������ģ���������Ѿ�����ErrorLimit�������ļ�������������
+		  {   
+              // input_order is in ascending order, so stop as soon as best_err reaches or exceeds the error limit.
 			  break;
 		  }
 	  }
@@ -567,7 +568,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
     const int num_blocks = block_width * block_height;
 
 
-    comparator_->StartBlockComparisons(); // ��ʼ��һЩ��������Ҫ�Ƕ�ԭͼ����һЩ����
+    comparator_->StartBlockComparisons();
 
     std::vector<CoeffData> output_order_gpu;
     std::vector<CoeffData> output_order_cpu;

From 5f309e7bc922e1dcca3d304feb801eeebee8fb75 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Tue, 11 Jul 2017 11:10:52 +0800
Subject: [PATCH 178/189] Remove some redundant files

---
 guetzli.vcxproj                |  68 ++------------
 guetzli.vcxproj.filters        | 167 ---------------------------------
 guetzli/processor.cc           |   2 +-
 guetzli_static.vcxproj         |  51 ----------
 guetzli_static.vcxproj.filters | 155 ------------------------------
 5 files changed, 9 insertions(+), 434 deletions(-)

diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index 32cc12c7..c4eb7a8f 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -102,7 +102,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;third_party\libjpeg;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>Full</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
@@ -114,7 +114,7 @@
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64;third_party\libjpeg\x64</AdditionalLibraryDirectories>
     </Link>
@@ -138,7 +138,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\libjpeg;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>false</IntrinsicFunctions>
@@ -150,7 +150,7 @@
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <ForceSymbolReferences>
       </ForceSymbolReferences>
@@ -171,7 +171,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\gperftools-gperftools-2.5\src;third_party\gperftools-gperftools-2.5\src\windows;third_party\libjpeg;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
@@ -179,7 +179,7 @@
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64;third_party\libjpeg\x64</AdditionalLibraryDirectories>
     </Link>
@@ -189,7 +189,7 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;third_party\libpng;third_party\zlib;third_party\libjpeg;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;%(PreprocessorDefinitions)</PreprocessorDefinitions>
@@ -197,7 +197,7 @@
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;jpeg.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <ForceSymbolReferences>
       </ForceSymbolReferences>
@@ -244,23 +244,6 @@
     <ClInclude Include="guetzli\score.h" />
     <ClInclude Include="guetzli\stats.h" />
     <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h" />
-    <ClInclude Include="third_party\libjpeg\jconfig.h" />
-    <ClInclude Include="third_party\libjpeg\jmorecfg.h" />
-    <ClInclude Include="third_party\libjpeg\jpeglib.h" />
-    <ClInclude Include="third_party\libpng\png.h" />
-    <ClInclude Include="third_party\libpng\pngconf.h" />
-    <ClInclude Include="third_party\libpng\pngpriv.h" />
-    <ClInclude Include="third_party\zlib\crc32.h" />
-    <ClInclude Include="third_party\zlib\deflate.h" />
-    <ClInclude Include="third_party\zlib\gzguts.h" />
-    <ClInclude Include="third_party\zlib\inffast.h" />
-    <ClInclude Include="third_party\zlib\inffixed.h" />
-    <ClInclude Include="third_party\zlib\inflate.h" />
-    <ClInclude Include="third_party\zlib\inftrees.h" />
-    <ClInclude Include="third_party\zlib\trees.h" />
-    <ClInclude Include="third_party\zlib\zconf.h" />
-    <ClInclude Include="third_party\zlib\zlib.h" />
-    <ClInclude Include="third_party\zlib\zutil.h" />
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="clguetzli\clbutter_comparator.cpp" />
@@ -293,36 +276,6 @@
     <ClCompile Include="guetzli\quantize.cc" />
     <ClCompile Include="guetzli\score.cc" />
     <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc" />
-    <ClCompile Include="third_party\libpng\png.c" />
-    <ClCompile Include="third_party\libpng\pngerror.c" />
-    <ClCompile Include="third_party\libpng\pngget.c" />
-    <ClCompile Include="third_party\libpng\pngmem.c" />
-    <ClCompile Include="third_party\libpng\pngpread.c" />
-    <ClCompile Include="third_party\libpng\pngread.c" />
-    <ClCompile Include="third_party\libpng\pngrio.c" />
-    <ClCompile Include="third_party\libpng\pngrtran.c" />
-    <ClCompile Include="third_party\libpng\pngrutil.c" />
-    <ClCompile Include="third_party\libpng\pngset.c" />
-    <ClCompile Include="third_party\libpng\pngtrans.c" />
-    <ClCompile Include="third_party\libpng\pngwio.c" />
-    <ClCompile Include="third_party\libpng\pngwrite.c" />
-    <ClCompile Include="third_party\libpng\pngwtran.c" />
-    <ClCompile Include="third_party\libpng\pngwutil.c" />
-    <ClCompile Include="third_party\zlib\adler32.c" />
-    <ClCompile Include="third_party\zlib\compress.c" />
-    <ClCompile Include="third_party\zlib\crc32.c" />
-    <ClCompile Include="third_party\zlib\deflate.c" />
-    <ClCompile Include="third_party\zlib\gzclose.c" />
-    <ClCompile Include="third_party\zlib\gzlib.c" />
-    <ClCompile Include="third_party\zlib\gzread.c" />
-    <ClCompile Include="third_party\zlib\gzwrite.c" />
-    <ClCompile Include="third_party\zlib\infback.c" />
-    <ClCompile Include="third_party\zlib\inffast.c" />
-    <ClCompile Include="third_party\zlib\inflate.c" />
-    <ClCompile Include="third_party\zlib\inftrees.c" />
-    <ClCompile Include="third_party\zlib\trees.c" />
-    <ClCompile Include="third_party\zlib\uncompr.c" />
-    <ClCompile Include="third_party\zlib\zutil.c" />
   </ItemGroup>
   <ItemGroup>
     <Intel_OpenCL_Build_Rules Include="clguetzli\clguetzli.cl">
@@ -351,11 +304,6 @@
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
     </CustomBuild>
-    <None Include="third_party\libpng\pngwin.def" />
-    <None Include="third_party\zlib\inffas32.asm" />
-    <None Include="third_party\zlib\match32.asm" />
-    <None Include="third_party\zlib\match686.asm" />
-    <None Include="third_party\zlib\zlib.def" />
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
diff --git a/guetzli.vcxproj.filters b/guetzli.vcxproj.filters
index 785c7382..7e005105 100644
--- a/guetzli.vcxproj.filters
+++ b/guetzli.vcxproj.filters
@@ -13,18 +13,9 @@
     <Filter Include="third_party\butteraugli\butteraugli">
       <UniqueIdentifier>{FD6FCB41-6929-36EC-F288-50C65E41EC5B}</UniqueIdentifier>
     </Filter>
-    <Filter Include="third_party\libpng">
-      <UniqueIdentifier>{40be58d6-6dfc-45a3-8ca1-7d1b14051ddc}</UniqueIdentifier>
-    </Filter>
-    <Filter Include="third_party\zlib">
-      <UniqueIdentifier>{cb89c1ac-8399-4814-88f2-4b69576bc9f9}</UniqueIdentifier>
-    </Filter>
     <Filter Include="clguetzli">
       <UniqueIdentifier>{64847a89-ca39-4556-ba0e-d6875c4d39ca}</UniqueIdentifier>
     </Filter>
-    <Filter Include="third_party\libjpeg">
-      <UniqueIdentifier>{1ac67559-7330-41c7-9a6d-10c3abee000e}</UniqueIdentifier>
-    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="guetzli\butteraugli_comparator.h">
@@ -105,48 +96,6 @@
     <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h">
       <Filter>third_party\butteraugli\butteraugli</Filter>
     </ClInclude>
-    <ClInclude Include="third_party\libpng\png.h">
-      <Filter>third_party\libpng</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\libpng\pngconf.h">
-      <Filter>third_party\libpng</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\libpng\pngpriv.h">
-      <Filter>third_party\libpng</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\crc32.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\deflate.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\gzguts.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inffast.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inffixed.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inflate.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inftrees.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\trees.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\zconf.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\zlib.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\zutil.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
     <ClInclude Include="clguetzli\utils.h">
       <Filter>clguetzli</Filter>
     </ClInclude>
@@ -174,15 +123,6 @@
     <ClInclude Include="clguetzli\cumem_pool.h">
       <Filter>clguetzli</Filter>
     </ClInclude>
-    <ClInclude Include="third_party\libjpeg\jconfig.h">
-      <Filter>third_party\libjpeg</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\libjpeg\jmorecfg.h">
-      <Filter>third_party\libjpeg</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\libjpeg\jpeglib.h">
-      <Filter>third_party\libjpeg</Filter>
-    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc">
@@ -248,96 +188,6 @@
     <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc">
       <Filter>third_party\butteraugli\butteraugli</Filter>
     </ClCompile>
-    <ClCompile Include="third_party\libpng\png.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngerror.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngget.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngmem.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngpread.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngread.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngrio.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngrtran.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngrutil.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngset.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngtrans.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwio.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwrite.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwtran.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwutil.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\adler32.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\compress.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\crc32.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\deflate.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzclose.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzlib.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzread.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzwrite.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\infback.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\inffast.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\inflate.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\inftrees.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\trees.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\uncompr.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\zutil.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
     <ClCompile Include="clguetzli\utils.cpp">
       <Filter>clguetzli</Filter>
     </ClCompile>
@@ -366,23 +216,6 @@
       <Filter>clguetzli</Filter>
     </ClCompile>
   </ItemGroup>
-  <ItemGroup>
-    <None Include="third_party\libpng\pngwin.def">
-      <Filter>third_party\libpng</Filter>
-    </None>
-    <None Include="third_party\zlib\inffas32.asm">
-      <Filter>third_party\zlib</Filter>
-    </None>
-    <None Include="third_party\zlib\match32.asm">
-      <Filter>third_party\zlib</Filter>
-    </None>
-    <None Include="third_party\zlib\match686.asm">
-      <Filter>third_party\zlib</Filter>
-    </None>
-    <None Include="third_party\zlib\zlib.def">
-      <Filter>third_party\zlib</Filter>
-    </None>
-  </ItemGroup>
   <ItemGroup>
     <Intel_OpenCL_Build_Rules Include="clguetzli\clguetzli.cl">
       <Filter>clguetzli</Filter>
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index ffbf6f24..d1cdb32a 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -33,7 +33,7 @@
 #include "guetzli/quantize.h"
 #include "clguetzli/clguetzli.h"
 
-#include "third_party/libjpeg/jpeglib.h"
+#include "jpeglib.h"
 
 namespace guetzli {
 
diff --git a/guetzli_static.vcxproj b/guetzli_static.vcxproj
index 05a75f9a..3c3bd850 100644
--- a/guetzli_static.vcxproj
+++ b/guetzli_static.vcxproj
@@ -176,20 +176,6 @@
     <ClInclude Include="guetzli\score.h" />
     <ClInclude Include="guetzli\stats.h" />
     <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h" />
-    <ClInclude Include="third_party\libpng\png.h" />
-    <ClInclude Include="third_party\libpng\pngconf.h" />
-    <ClInclude Include="third_party\libpng\pngpriv.h" />
-    <ClInclude Include="third_party\zlib\crc32.h" />
-    <ClInclude Include="third_party\zlib\deflate.h" />
-    <ClInclude Include="third_party\zlib\gzguts.h" />
-    <ClInclude Include="third_party\zlib\inffast.h" />
-    <ClInclude Include="third_party\zlib\inffixed.h" />
-    <ClInclude Include="third_party\zlib\inflate.h" />
-    <ClInclude Include="third_party\zlib\inftrees.h" />
-    <ClInclude Include="third_party\zlib\trees.h" />
-    <ClInclude Include="third_party\zlib\zconf.h" />
-    <ClInclude Include="third_party\zlib\zlib.h" />
-    <ClInclude Include="third_party\zlib\zutil.h" />
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc" />
@@ -212,43 +198,6 @@
     <ClCompile Include="guetzli\quantize.cc" />
     <ClCompile Include="guetzli\score.cc" />
     <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc" />
-    <ClCompile Include="third_party\libpng\png.c" />
-    <ClCompile Include="third_party\libpng\pngerror.c" />
-    <ClCompile Include="third_party\libpng\pngget.c" />
-    <ClCompile Include="third_party\libpng\pngmem.c" />
-    <ClCompile Include="third_party\libpng\pngpread.c" />
-    <ClCompile Include="third_party\libpng\pngread.c" />
-    <ClCompile Include="third_party\libpng\pngrio.c" />
-    <ClCompile Include="third_party\libpng\pngrtran.c" />
-    <ClCompile Include="third_party\libpng\pngrutil.c" />
-    <ClCompile Include="third_party\libpng\pngset.c" />
-    <ClCompile Include="third_party\libpng\pngtrans.c" />
-    <ClCompile Include="third_party\libpng\pngwio.c" />
-    <ClCompile Include="third_party\libpng\pngwrite.c" />
-    <ClCompile Include="third_party\libpng\pngwtran.c" />
-    <ClCompile Include="third_party\libpng\pngwutil.c" />
-    <ClCompile Include="third_party\zlib\adler32.c" />
-    <ClCompile Include="third_party\zlib\compress.c" />
-    <ClCompile Include="third_party\zlib\crc32.c" />
-    <ClCompile Include="third_party\zlib\deflate.c" />
-    <ClCompile Include="third_party\zlib\gzclose.c" />
-    <ClCompile Include="third_party\zlib\gzlib.c" />
-    <ClCompile Include="third_party\zlib\gzread.c" />
-    <ClCompile Include="third_party\zlib\gzwrite.c" />
-    <ClCompile Include="third_party\zlib\infback.c" />
-    <ClCompile Include="third_party\zlib\inffast.c" />
-    <ClCompile Include="third_party\zlib\inflate.c" />
-    <ClCompile Include="third_party\zlib\inftrees.c" />
-    <ClCompile Include="third_party\zlib\trees.c" />
-    <ClCompile Include="third_party\zlib\uncompr.c" />
-    <ClCompile Include="third_party\zlib\zutil.c" />
-  </ItemGroup>
-  <ItemGroup>
-    <None Include="third_party\libpng\pngwin.def" />
-    <None Include="third_party\zlib\inffas32.asm" />
-    <None Include="third_party\zlib\match32.asm" />
-    <None Include="third_party\zlib\match686.asm" />
-    <None Include="third_party\zlib\zlib.def" />
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
diff --git a/guetzli_static.vcxproj.filters b/guetzli_static.vcxproj.filters
index 37876e3d..94654c91 100644
--- a/guetzli_static.vcxproj.filters
+++ b/guetzli_static.vcxproj.filters
@@ -13,12 +13,6 @@
     <Filter Include="third_party\butteraugli\butteraugli">
       <UniqueIdentifier>{FD6FCB41-6929-36EC-F288-50C65E41EC5B}</UniqueIdentifier>
     </Filter>
-    <Filter Include="third_party\libpng">
-      <UniqueIdentifier>{61f0e3eb-c213-49c5-883a-060bdaf927bb}</UniqueIdentifier>
-    </Filter>
-    <Filter Include="third_party\zlib">
-      <UniqueIdentifier>{ba7b6163-a7d1-4f14-b4b3-3d35f296563a}</UniqueIdentifier>
-    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="guetzli\butteraugli_comparator.h">
@@ -99,48 +93,6 @@
     <ClInclude Include="third_party\butteraugli\butteraugli\butteraugli.h">
       <Filter>third_party\butteraugli\butteraugli</Filter>
     </ClInclude>
-    <ClInclude Include="third_party\libpng\png.h">
-      <Filter>third_party\libpng</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\libpng\pngconf.h">
-      <Filter>third_party\libpng</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\libpng\pngpriv.h">
-      <Filter>third_party\libpng</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\crc32.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\deflate.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\gzguts.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inffast.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inffixed.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inflate.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\inftrees.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\trees.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\zconf.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\zlib.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
-    <ClInclude Include="third_party\zlib\zutil.h">
-      <Filter>third_party\zlib</Filter>
-    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="guetzli\butteraugli_comparator.cc">
@@ -203,112 +155,5 @@
     <ClCompile Include="third_party\butteraugli\butteraugli\butteraugli.cc">
       <Filter>third_party\butteraugli\butteraugli</Filter>
     </ClCompile>
-    <ClCompile Include="third_party\libpng\png.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngerror.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngget.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngmem.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngpread.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngread.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngrio.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngrtran.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngrutil.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngset.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngtrans.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwio.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwrite.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwtran.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\libpng\pngwutil.c">
-      <Filter>third_party\libpng</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\adler32.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\compress.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\crc32.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\deflate.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzclose.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzlib.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzread.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\gzwrite.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\infback.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\inffast.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\inflate.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\inftrees.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\trees.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\uncompr.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-    <ClCompile Include="third_party\zlib\zutil.c">
-      <Filter>third_party\zlib</Filter>
-    </ClCompile>
-  </ItemGroup>
-  <ItemGroup>
-    <None Include="third_party\libpng\pngwin.def">
-      <Filter>third_party\libpng</Filter>
-    </None>
-    <None Include="third_party\zlib\inffas32.asm">
-      <Filter>third_party\zlib</Filter>
-    </None>
-    <None Include="third_party\zlib\match32.asm">
-      <Filter>third_party\zlib</Filter>
-    </None>
-    <None Include="third_party\zlib\match686.asm">
-      <Filter>third_party\zlib</Filter>
-    </None>
-    <None Include="third_party\zlib\zlib.def">
-      <Filter>third_party\zlib</Filter>
-    </None>
   </ItemGroup>
 </Project>
\ No newline at end of file

From 5aa73ae39a460ba7f5f8a8fd52c47e7b8bf53c8b Mon Sep 17 00:00:00 2001
From: strongtu <tuqbasic@qq.com>
Date: Tue, 11 Jul 2017 11:15:25 +0800
Subject: [PATCH 179/189] Modify makefile

---
 guetzli.make | 4 ++--
 premake5.lua | 4 +---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/guetzli.make b/guetzli.make
index a458eb09..52dbff8f 100644
--- a/guetzli.make
+++ b/guetzli.make
@@ -15,7 +15,7 @@ ifeq ($(config),release)
   TARGETDIR = bin/Release
   TARGET = $(TARGETDIR)/guetzli
   OBJDIR = obj/Release/guetzli
-  DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__ -D__USE_GPERFTOOLS__
+  DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__
   INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)
@@ -42,7 +42,7 @@ ifeq ($(config),debug)
   TARGETDIR = bin/Debug
   TARGET = $(TARGETDIR)/guetzli
   OBJDIR = obj/Debug/guetzli
-  DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__ -D__USE_GPERFTOOLS__
+  DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__
   INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)
diff --git a/premake5.lua b/premake5.lua
index 1c5ef6c6..7f2cc3e3 100644
--- a/premake5.lua
+++ b/premake5.lua
@@ -31,7 +31,6 @@ workspace "guetzli"
         "guetzli/*.h",
         "third_party/butteraugli/butteraugli/butteraugli.cc",
         "third_party/butteraugli/butteraugli/butteraugli.h",
-        "third_party/libjpeg/*.h",
         "clguetzli/*.cpp",
         "clguetzli/*.h"
       }
@@ -43,7 +42,7 @@ workspace "guetzli"
   project "guetzli"
     kind "ConsoleApp"
     filter "action:gmake"
-	  defines { "__USE_OPENCL__", "__USE_CUDA__", "__USE_GPERFTOOLS__" }
+	  defines { "__USE_OPENCL__", "__USE_CUDA__" }
       linkoptions { "`pkg-config --libs libpng || libpng-config --ldflags`" }
       buildoptions { "`pkg-config --cflags libpng || libpng-config --cflags`" }
       links { "OpenCL", "cuda", "profiler", "unwind", "jpeg" }
@@ -56,7 +55,6 @@ workspace "guetzli"
         "guetzli/*.h",
         "third_party/butteraugli/butteraugli/butteraugli.cc",
         "third_party/butteraugli/butteraugli/butteraugli.h",
-        "third_party/libjpeg/*.h",
         "clguetzli/*.cpp",
         "clguetzli/*.h"
       }

From c525adf38cb99ae64d4de5a5954e992b38ed1714 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Thu, 13 Jul 2017 00:41:55 +0800
Subject: [PATCH 180/189] Disable CUDA & OpenCL by default

---
 clguetzli/cl.hpp             |  4 ++++
 clguetzli/clguetzli.cl       |  3 +++
 clguetzli/clguetzli.cl.h     |  6 +++++-
 clguetzli/clguetzli_test.cpp |  4 ++--
 clguetzli/ocl.h              |  4 ++--
 clguetzli/ocu.cpp            |  4 ++--
 clguetzli/utils.cpp          |  5 ++++-
 guetzli.make                 | 16 ++++++++--------
 guetzli.vcxproj              | 24 ++++++++++++------------
 guetzli/processor.cc         | 11 +++++++++++
 guetzli_static.make          |  8 ++++----
 premake5.lua                 |  8 ++++----
 12 files changed, 61 insertions(+), 36 deletions(-)

diff --git a/clguetzli/cl.hpp b/clguetzli/cl.hpp
index 8be6313e..a7043b50 100644
--- a/clguetzli/cl.hpp
+++ b/clguetzli/cl.hpp
@@ -1,5 +1,7 @@
 #pragma once
 
+#ifdef __USE_OPENCL__
+
 template<typename T>
 inline void clSetKernelArgK(cl_kernel k, int idx, T* t)
 {
@@ -316,3 +318,5 @@ inline void clSetKernelArgEx(cl_kernel k,
     clSetKernelArgK(k, 24, t24);
     clSetKernelArgEx(k, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22, t23);
 }
+
+#endif // __USE_OPENCL__
\ No newline at end of file
diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index b4d11a92..c2e67e80 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1,3 +1,4 @@
+#ifdef __USE_OPENCL__
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
 #include  "clguetzli/clguetzli.cl.h"
@@ -3408,3 +3409,5 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
 #ifdef __USE_DOUBLE_AS_FLOAT__
 #undef double
 #endif
+
+#endif __USE_OPENCL__
\ No newline at end of file
diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h
index 102f3ac9..761ed634 100644
--- a/clguetzli/clguetzli.cl.h
+++ b/clguetzli/clguetzli.cl.h
@@ -1,6 +1,8 @@
 #ifndef __CLGUETZLI_CL_H__
 #define __CLGUETZLI_CL_H__
 
+#ifdef __USE_OPENCL__
+
 #ifdef __cplusplus
 #ifndef __CUDACC__
 #include "CL/cl.h"
@@ -148,4 +150,6 @@
         __global const ushort  *pixel;
     }channel_info;
 
-#endif /*__CLGUETZLI_CL_H__*/
\ No newline at end of file
+#endif /*__CLGUETZLI_CL_H__*/
+
+#endif // __USE_OPENCL__
\ No newline at end of file
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 967a6652..6e6fece8 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -1,3 +1,5 @@
+#ifdef __USE_OPENCL__
+
 #include <CL/cl.h>
 #include <math.h>
 #include <assert.h>
@@ -7,8 +9,6 @@
 #include "ocl.h"
 #include "ocu.h"
 
-#ifdef __USE_OPENCL__
-
 #define FLOAT_COMPARE(a, b, c)  floatCompare((a), (b), (c), __FUNCTION__, __LINE__ )
 
 int floatCompare(const float* a, const float* b, size_t size, const char* szFunc, int line)
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index f182bb88..f3056dd8 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -1,11 +1,11 @@
 #pragma once
 
+#ifdef __USE_OPENCL__
+
 #include "CL/cl.h"
 #include "utils.h"
 #include "clguetzli.cl.h"
 
-#ifdef __USE_OPENCL__
-
 // Macros for OpenCL versions
 #define OPENCL_VERSION_1_2  1.2f
 #define OPENCL_VERSION_2_0  2.0f
diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index 2afe793d..ea66be55 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -1,8 +1,8 @@
 #include "ocu.h"
-#include <cuda.h>
-#include <nvrtc.h>
 
 #ifdef __USE_CUDA__
+#include <cuda.h>
+#include <nvrtc.h>
 
 ocu_args_d_t& getOcu(void)
 {
diff --git a/clguetzli/utils.cpp b/clguetzli/utils.cpp
index 4fc8dbc2..da699406 100644
--- a/clguetzli/utils.cpp
+++ b/clguetzli/utils.cpp
@@ -19,6 +19,7 @@
  * Intel Corporation is the author of the Materials, and requests that all
  * problem reports or change requests be submitted to it directly
  *****************************************************************************/
+#ifdef __USE_OPENCL__
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -96,4 +97,6 @@ int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize)
     }
     return errorCode;
 }
-#pragma warning( pop )
\ No newline at end of file
+#pragma warning( pop )
+
+#endif
\ No newline at end of file
diff --git a/guetzli.make b/guetzli.make
index 52dbff8f..e16aa99b 100644
--- a/guetzli.make
+++ b/guetzli.make
@@ -15,16 +15,16 @@ ifeq ($(config),release)
   TARGETDIR = bin/Release
   TARGET = $(TARGETDIR)/guetzli
   OBJDIR = obj/Release/guetzli
-  DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__
-  INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
+  DEFINES +=
+  INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)
   ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -O3 -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES)
-  LIBS += -lOpenCL -lcuda -lprofiler -lunwind -ljpeg
+  LIBS +=
   LDDEPS +=
-  ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags`
+  ALL_LDFLAGS += $(LDFLAGS) `pkg-config --libs libpng || libpng-config --ldflags`
   LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS)
   define PREBUILDCMDS
   endef
@@ -42,16 +42,16 @@ ifeq ($(config),debug)
   TARGETDIR = bin/Debug
   TARGET = $(TARGETDIR)/guetzli
   OBJDIR = obj/Debug/guetzli
-  DEFINES += -D__USE_OPENCL__ -D__USE_CUDA__
-  INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
+  DEFINES +=
+  INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)
   ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_CXXFLAGS += $(CXXFLAGS) $(ALL_CPPFLAGS) -g -std=c++11 `pkg-config --cflags libpng || libpng-config --cflags`
   ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES)
-  LIBS += -lOpenCL -lcuda -lprofiler -lunwind -ljpeg
+  LIBS +=
   LDDEPS +=
-  ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --libs libpng || libpng-config --ldflags`
+  ALL_LDFLAGS += $(LDFLAGS) `pkg-config --libs libpng || libpng-config --ldflags`
   LINKCMD = $(CXX) -o "$@" $(OBJECTS) $(RESOURCES) $(ALL_LDFLAGS) $(LIBS)
   define PREBUILDCMDS
   endef
diff --git a/guetzli.vcxproj b/guetzli.vcxproj
index c4eb7a8f..3a0eb72c 100644
--- a/guetzli.vcxproj
+++ b/guetzli.vcxproj
@@ -102,19 +102,19 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>Full</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <MinimalRebuild>false</MinimalRebuild>
       <StringPooling>true</StringPooling>
-      <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64;third_party\libjpeg\x64</AdditionalLibraryDirectories>
     </Link>
@@ -138,19 +138,19 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>false</IntrinsicFunctions>
       <MinimalRebuild>false</MinimalRebuild>
       <StringPooling>true</StringPooling>
-      <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <ForceSymbolReferences>
       </ForceSymbolReferences>
@@ -171,15 +171,15 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>_UNICODE;UNICODE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <AdditionalLibraryDirectories>$(CUDA_PATH)\lib\x64;third_party\libjpeg\x64</AdditionalLibraryDirectories>
     </Link>
@@ -189,15 +189,15 @@
     <ClCompile>
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
-      <AdditionalIncludeDirectories>.;$(CUDA_PATH)\include;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.;third_party\butteraugli;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>__USE_CUDA__;__USE_OPENCL__;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>cuda.lib;OpenCL.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
       <ForceSymbolReferences>
       </ForceSymbolReferences>
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
index d1cdb32a..2e8837dc 100644
--- a/guetzli/processor.cc
+++ b/guetzli/processor.cc
@@ -33,7 +33,9 @@
 #include "guetzli/quantize.h"
 #include "clguetzli/clguetzli.h"
 
+#ifdef __SUPPORT_FULL_JPEG__
 #include "jpeglib.h"
+#endif
 
 namespace guetzli {
 
@@ -668,6 +670,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
         }
     }
 
+#ifdef __USE_OPENCL__
     if (MODE_CHECKCL == g_mathMode)
     {
         int count = 0;
@@ -685,6 +688,7 @@ void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img, co
             LogError("CHK %s(%d) %d:%d\r\n", "SelectFrequencyMasking", __LINE__, count, check_size);
         }
     }
+#endif
 
     std::vector<int> candidate_coeff_offsets(num_blocks + 1);
     std::vector<uint8_t> candidate_coeffs;
@@ -1063,6 +1067,7 @@ bool Process(const Params& params, ProcessStats* stats,
 bool ProcessUnsupportedJpegData(const Params& params, ProcessStats* stats,
 	const std::string& data,
 	std::string* jpg_out) {
+#ifdef __SUPPORT_FULL_JPEG__
 	struct jpeg_decompress_struct cinfo;
 	struct jpeg_error_mgr jerr;
 	cinfo.err = jpeg_std_error(&jerr);
@@ -1092,6 +1097,12 @@ bool ProcessUnsupportedJpegData(const Params& params, ProcessStats* stats,
 	}
 	std::vector<uint8_t> temp_rgb(bmp_buffer, bmp_buffer + bmp_size);
 	return Process(params, stats, temp_rgb, xsize, ysize, jpg_out);
+#else
+	fprintf(stderr, "Unsupported input JPEG file (e.g. unsupported "
+		"downsampling mode).\nPlease provide the input image as "
+		"a PNG file.\n");
+	return false;
+#endif
 }
 
 bool Process(const Params& params, ProcessStats* stats,
diff --git a/guetzli_static.make b/guetzli_static.make
index 2d648c04..9fe7bf05 100644
--- a/guetzli_static.make
+++ b/guetzli_static.make
@@ -16,7 +16,7 @@ ifeq ($(config),release)
   TARGET = $(TARGETDIR)/libguetzli_static.a
   OBJDIR = obj/Release/guetzli_static
   DEFINES +=
-  INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
+  INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)
   ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -O3 -g `pkg-config --static --cflags libpng || libpng-config --static --cflags`
@@ -24,7 +24,7 @@ ifeq ($(config),release)
   ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES)
   LIBS +=
   LDDEPS +=
-  ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --static --libs libpng || libpng-config --static --ldflags`
+  ALL_LDFLAGS += $(LDFLAGS) `pkg-config --static --libs libpng || libpng-config --static --ldflags`
   LINKCMD = $(AR) -rcs "$@" $(OBJECTS)
   define PREBUILDCMDS
   endef
@@ -43,7 +43,7 @@ ifeq ($(config),debug)
   TARGET = $(TARGETDIR)/libguetzli_static.a
   OBJDIR = obj/Debug/guetzli_static
   DEFINES +=
-  INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli -I"$(OPENCL_INC)"
+  INCLUDES += -I. -Ithird_party/butteraugli -Iclguetzli
   FORCE_INCLUDE +=
   ALL_CPPFLAGS += $(CPPFLAGS) -MMD -MP $(DEFINES) $(INCLUDES)
   ALL_CFLAGS += $(CFLAGS) $(ALL_CPPFLAGS) -g `pkg-config --static --cflags libpng || libpng-config --static --cflags`
@@ -51,7 +51,7 @@ ifeq ($(config),debug)
   ALL_RESFLAGS += $(RESFLAGS) $(DEFINES) $(INCLUDES)
   LIBS +=
   LDDEPS +=
-  ALL_LDFLAGS += $(LDFLAGS) -L"$(OPENCL_LIB)" `pkg-config --static --libs libpng || libpng-config --static --ldflags`
+  ALL_LDFLAGS += $(LDFLAGS) `pkg-config --static --libs libpng || libpng-config --static --ldflags`
   LINKCMD = $(AR) -rcs "$@" $(OBJECTS)
   define PREBUILDCMDS
   endef
diff --git a/premake5.lua b/premake5.lua
index 7f2cc3e3..cc41301b 100644
--- a/premake5.lua
+++ b/premake5.lua
@@ -2,8 +2,8 @@ workspace "guetzli"
   configurations { "Release", "Debug" }
   language "C++"
   flags { "C++11" }
-  includedirs { ".", "third_party/butteraugli", "clguetzli", "$(OPENCL_INC)" }
-  libdirs { "$(OPENCL_LIB)" }
+  includedirs { ".", "third_party/butteraugli", "clguetzli" }
+  libdirs {}
 
   filter "action:vs*"
     platforms { "x86_64", "x86" }
@@ -42,10 +42,10 @@ workspace "guetzli"
   project "guetzli"
     kind "ConsoleApp"
     filter "action:gmake"
-	  defines { "__USE_OPENCL__", "__USE_CUDA__" }
+	  --defines { "__USE_OPENCL__", "__USE_CUDA__", "__SUPPORT_FULL_JPEG__" }
       linkoptions { "`pkg-config --libs libpng || libpng-config --ldflags`" }
       buildoptions { "`pkg-config --cflags libpng || libpng-config --cflags`" }
-      links { "OpenCL", "cuda", "profiler", "unwind", "jpeg" }
+      --links { "OpenCL", "cuda", "profiler", "unwind", "jpeg" }
     filter "action:vs*"
       links { "shlwapi" }
     filter {}

From ba219439ca41f95b8bd678966a8631735db6fbe1 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Thu, 13 Jul 2017 10:19:40 +0800
Subject: [PATCH 181/189] Add netpbm

According to the CI failure log, the pngtopnm command is not available in the
test environment, so add the netpbm package and try again.
---
 .travis.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index 39e1caaa..657e3e7b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,6 +13,7 @@ matrix:
           packages:
             - wget
             - libjpeg-progs
+            - netpbm
 
     - os: osx
       env: BUILD_SYSTEM=bazel
@@ -29,6 +30,7 @@ matrix:
             - libpng-dev
             - pkg-config
             - libjpeg-progs
+            - netpbm
 
     - os: osx
       env: BUILD_SYSTEM=make

From 93fd3f3abed6d064f39f8932d1c58f05ab99d382 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Thu, 13 Jul 2017 10:39:13 +0800
Subject: [PATCH 182/189] Fix type cast error on Mac

---
 clguetzli/clbutter_comparator.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index e39966b1..91e599b5 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -730,9 +730,9 @@ void MaskHighIntensityChangeOpt(
 		for (size_t x = 0; x < xsize; ++x) {
 			size_t ix = y * xsize + x;
 			const float ave[3] = {
-				(c0[0][ix] + c1[0][ix]) * 0.5,
-				(c0[1][ix] + c1[1][ix]) * 0.5,
-				(c0[2][ix] + c1[2][ix]) * 0.5,
+				static_cast<float>((c0[0][ix] + c1[0][ix]) * 0.5),
+				static_cast<float>((c0[1][ix] + c1[1][ix]) * 0.5),
+				static_cast<float>((c0[2][ix] + c1[2][ix]) * 0.5),
 			};
 			float sqr_max_diff = -1;
 			{

From 1c1d7e641b8d482381385aaa3bfd5bde8340ebf9 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Thu, 13 Jul 2017 11:25:39 +0800
Subject: [PATCH 183/189] Update bazel version to 0.5.2

---
 .travis.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.travis.sh b/.travis.sh
index a30f38e5..b7197e7c 100755
--- a/.travis.sh
+++ b/.travis.sh
@@ -14,9 +14,9 @@ case "$1" in
 	    "bazel")
 		case "${TRAVIS_OS_NAME}" in
 		    "linux")
-			wget https://github.com/bazelbuild/bazel/releases/download/0.4.5/bazel_0.4.5-linux-x86_64.deb
-			echo 'b494d0a413e4703b6cd5312403bea4d92246d6425b3be68c9bfbeb8cc4db8a55  bazel_0.4.5-linux-x86_64.deb' | sha256sum -c --strict || exit 1
-			sudo dpkg -i bazel_0.4.5-linux-x86_64.deb
+			wget https://github.com/bazelbuild/bazel/releases/download/0.5.2/bazel_0.5.2-linux-x86_64.deb
+			echo 'b14c8773dab078d3422fe4082f3ab4d9e14f02313c3b3eb4b5b40c44ce29ed59  bazel_0.5.2-linux-x86_64.deb' | sha256sum -c --strict || exit 1
+			sudo dpkg -i bazel_0.5.2-linux-x86_64.deb
 			;;
 		    "osx")
 			brew install bazel

From 1cb26c7cc9373e0d4828f3bf265e6b25e0d45143 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Thu, 13 Jul 2017 12:52:05 +0800
Subject: [PATCH 184/189] Add oracle-java8-installer

---
 .travis.sh  | 7 ++++---
 .travis.yml | 2 ++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.travis.sh b/.travis.sh
index b7197e7c..905889ff 100755
--- a/.travis.sh
+++ b/.travis.sh
@@ -14,9 +14,10 @@ case "$1" in
 	    "bazel")
 		case "${TRAVIS_OS_NAME}" in
 		    "linux")
-			wget https://github.com/bazelbuild/bazel/releases/download/0.5.2/bazel_0.5.2-linux-x86_64.deb
-			echo 'b14c8773dab078d3422fe4082f3ab4d9e14f02313c3b3eb4b5b40c44ce29ed59  bazel_0.5.2-linux-x86_64.deb' | sha256sum -c --strict || exit 1
-			sudo dpkg -i bazel_0.5.2-linux-x86_64.deb
+			sudo apt-get remove oracle-java9-installer
+			wget https://github.com/bazelbuild/bazel/releases/download/0.4.5/bazel_0.4.5-linux-x86_64.deb
+			echo 'b494d0a413e4703b6cd5312403bea4d92246d6425b3be68c9bfbeb8cc4db8a55  bazel_0.4.5-linux-x86_64.deb' | sha256sum -c --strict || exit 1
+			sudo dpkg -i bazel_0.4.5-linux-x86_64.deb
 			;;
 		    "osx")
 			brew install bazel
diff --git a/.travis.yml b/.travis.yml
index 657e3e7b..9f297c16 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,6 +14,7 @@ matrix:
             - wget
             - libjpeg-progs
             - netpbm
+            - oracle-java8-installer
 
     - os: osx
       env: BUILD_SYSTEM=bazel
@@ -37,6 +38,7 @@ matrix:
 
 
 install:
+- jdk_switcher use oraclejdk8
 - ./.travis.sh install
 script:
 - ./.travis.sh script

From 40665e219e344e25a005af1078bd7f4deab21136 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Thu, 13 Jul 2017 15:11:12 +0800
Subject: [PATCH 185/189] Try to fix Bazel build

---
 .travis.yml                                   |  1 -
 BUILD                                         |  4 ++
 clguetzli/clbutter_comparator.cpp             |  3 +-
 .../butteraugli/butteraugli/butteraugli.cc    | 50 +++++++++++++++++++
 4 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 9f297c16..85db2b53 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -38,7 +38,6 @@ matrix:
 
 
 install:
-- jdk_switcher use oraclejdk8
 - ./.travis.sh install
 script:
 - ./.travis.sh script
diff --git a/BUILD b/BUILD
index 05bfc0da..8f2e28f1 100644
--- a/BUILD
+++ b/BUILD
@@ -8,6 +8,10 @@ cc_library(
             "guetzli/*.h",
             "guetzli/*.cc",
             "guetzli/*.inc",
+			"clguetzli/*.cpp",
+            "clguetzli/*.h",
+            "clguetzli/*.hpp"
+
         ],
         exclude = ["guetzli/guetzli.cc"],
     ),
diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index 91e599b5..3d4eb7dd 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -1735,9 +1735,8 @@ namespace butteraugli
         float border_ratio,
         float* __restrict__ result)
     {
-        _Convolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
-
 #ifdef __USE_OPENCL__
+		_Convolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
         if (MODE_CHECKCL == g_mathMode && xsize > 8 && ysize > 8)
         {
             tclConvolution(xsize, ysize, xstep, len, offset, multipliers, inp, border_ratio, result);
diff --git a/third_party/butteraugli/butteraugli/butteraugli.cc b/third_party/butteraugli/butteraugli/butteraugli.cc
index b62e1578..c32f226c 100644
--- a/third_party/butteraugli/butteraugli/butteraugli.cc
+++ b/third_party/butteraugli/butteraugli/butteraugli.cc
@@ -40,9 +40,11 @@
 #include <algorithm>
 #include <array>
 
+#ifdef __USE_OPENCL__
 #include "clguetzli/clbutter_comparator.h"
 #include "clguetzli/clguetzli.h"
 #include "clguetzli/clguetzli_test.h"
+#endif
 
 // Restricted pointers speed up Convolution(); MSVC uses a different keyword.
 #ifdef _MSC_VER
@@ -112,17 +114,28 @@ void _Blur(size_t xsize, size_t ysize, float* channel, double sigma,
   int dxsize = (xsize + xstep - 1) / xstep;
   int dysize = (ysize + ystep - 1) / ystep;
   std::vector<float> tmp(dxsize * ysize);
+#ifdef __USE_OPENCL__
   Convolution(xsize, ysize, xstep, expn_size, diff, expn.data(), channel,
               border_ratio,
               tmp.data());
+#else
+  _Convolution(xsize, ysize, xstep, expn_size, diff, expn.data(), channel,
+	          border_ratio,
+	          tmp.data());
+#endif
   float* output = channel;
   std::vector<float> downsampled_output;
   if (xstep > 1) {
     downsampled_output.resize(dxsize * dysize);
     output = downsampled_output.data();
   }
+#ifdef __USE_OPENCL__
   Convolution(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(),
               border_ratio, output);
+#else
+  _Convolution(ysize, dxsize, ystep, expn_size, diff, expn.data(), tmp.data(),
+	          border_ratio, output);
+#endif
   if (xstep > 1) {
     for (size_t y = 0; y < ysize; y++) {
       for (size_t x = 0; x < xsize; x++) {
@@ -1022,7 +1035,11 @@ void _CalculateDiffmap(const size_t xsize, const size_t ysize,
             += static_cast<float>(mul1) * blurred[y * (xsize - s) + x];
       }
     }
+#ifdef __USE_OPENCL__
     ScaleImage(scale, diffmap);
+#else
+	_ScaleImage(scale, diffmap);
+#endif
   }
 }
 
@@ -1054,7 +1071,11 @@ void ButteraugliComparator::DiffmapOpsinDynamicsImage(
     CombineChannels(mask_xyb, mask_xyb_dc, block_diff_dc, block_diff_ac,
                     edge_detector_map, &result);
   }
+#ifdef __USE_OPENCL__
   CalculateDiffmap(xsize_, ysize_, step_, &result);
+#else
+  _CalculateDiffmap(xsize_, ysize_, step_, &result);
+#endif
 }
 
 void ButteraugliComparator::BlockDiffMap(
@@ -1366,7 +1387,11 @@ void _Average5x5(int xsize, int ysize, std::vector<float>* diffs) {
   std::vector<float> result = *diffs;
   std::vector<float> tmp0 = *diffs;
   std::vector<float> tmp1 = *diffs;
+#ifdef __USE_OPENCL__
   ScaleImage(w, &tmp1);
+#else
+  _ScaleImage(w, &tmp1);
+#endif
   for (int y = 0; y < ysize; y++) {
     const int row0 = y * xsize;
     result[row0 + 1] += tmp0[row0];
@@ -1405,7 +1430,11 @@ void _Average5x5(int xsize, int ysize, std::vector<float>* diffs) {
     }
   }
   *diffs = result;
+#ifdef __USE_OPENCL__
   ScaleImage(scale, diffs);
+#else
+  _ScaleImage(scale, diffs);
+#endif
 }
 
 void _DiffPrecompute(
@@ -1473,6 +1502,7 @@ void _Mask(const std::vector<std::vector<float> > &xyb0,
   for (int i = 0; i < 3; ++i) {
     (*mask)[i].resize(xsize * ysize);
   }
+#ifdef __USE_OPENCL__
   DiffPrecompute(xyb0, xyb1, xsize, ysize, mask);
   for (int i = 0; i < 3; ++i) {
     Average5x5(xsize, ysize, &(*mask)[i]);
@@ -1484,6 +1514,19 @@ void _Mask(const std::vector<std::vector<float> > &xyb0,
     };
     Blur(xsize, ysize, (*mask)[i].data(), sigma[i], 0.0);
   }
+#else
+  _DiffPrecompute(xyb0, xyb1, xsize, ysize, mask);
+  for (int i = 0; i < 3; ++i) {
+	  _Average5x5(xsize, ysize, &(*mask)[i]);
+	  _MinSquareVal(4, 0, xsize, ysize, (*mask)[i].data());
+	  static const double sigma[3] = {
+		  9.65781083553,
+		  14.2644604355,
+		  4.53358927369,
+	  };
+	  _Blur(xsize, ysize, (*mask)[i].data(), sigma[i], 0.0);
+  }
+#endif
   static const double w00 = 232.206464018;
   static const double w11 = 22.9455222245;
   static const double w22 = 503.962310606;
@@ -1510,10 +1553,17 @@ void _Mask(const std::vector<std::vector<float> > &xyb0,
       (*mask_dc)[2][idx] = static_cast<float>(MaskDcB(p2));
     }
   }
+#ifdef __USE_OPENCL__
   for (int i = 0; i < 3; ++i) {
     ScaleImage(kGlobalScale * kGlobalScale, &(*mask)[i]);
     ScaleImage(kGlobalScale * kGlobalScale, &(*mask_dc)[i]);
   }
+#else
+  for (int i = 0; i < 3; ++i) {
+    _ScaleImage(kGlobalScale * kGlobalScale, &(*mask)[i]);
+    _ScaleImage(kGlobalScale * kGlobalScale, &(*mask_dc)[i]);
+  }
+#endif
 }
 
 }  // namespace butteraugli

From 05ee2f8ef6d6acaba129ca0070ac7f151a305a69 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Thu, 13 Jul 2017 19:38:49 +0800
Subject: [PATCH 186/189] Add author information

---
 BUILD                             | 3 +--
 clguetzli/clbutter_comparator.cpp | 7 +++++++
 clguetzli/clbutter_comparator.h   | 7 +++++++
 clguetzli/clguetzli.cl            | 7 +++++++
 clguetzli/clguetzli.cl.cpp        | 7 +++++++
 clguetzli/clguetzli.cl.h          | 7 +++++++
 clguetzli/clguetzli.cpp           | 7 +++++++
 clguetzli/clguetzli.cu            | 7 +++++++
 clguetzli/clguetzli.h             | 7 +++++++
 clguetzli/clguetzli_test.cpp      | 7 +++++++
 clguetzli/clguetzli_test.h        | 7 +++++++
 clguetzli/cuguetzli.cpp           | 7 +++++++
 clguetzli/cuguetzli.h             | 7 +++++++
 clguetzli/cumem_pool.cpp          | 6 ++++++
 clguetzli/cumem_pool.h            | 5 +++++
 clguetzli/ocl.cpp                 | 6 ++++++
 clguetzli/ocl.h                   | 6 ++++++
 clguetzli/ocu.cpp                 | 5 +++++
 clguetzli/ocu.h                   | 5 +++++
 19 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/BUILD b/BUILD
index 8f2e28f1..c88d3890 100644
--- a/BUILD
+++ b/BUILD
@@ -8,10 +8,9 @@ cc_library(
             "guetzli/*.h",
             "guetzli/*.cc",
             "guetzli/*.inc",
-			"clguetzli/*.cpp",
+            "clguetzli/*.cpp",
             "clguetzli/*.h",
             "clguetzli/*.hpp"
-
         ],
         exclude = ["guetzli/guetzli.cc"],
     ),
diff --git a/clguetzli/clbutter_comparator.cpp b/clguetzli/clbutter_comparator.cpp
index 3d4eb7dd..d91055d5 100644
--- a/clguetzli/clbutter_comparator.cpp
+++ b/clguetzli/clbutter_comparator.cpp
@@ -1,3 +1,10 @@
+/*
+* OpenCL/CUDA edition implementation of butter_comparator.
+*
+* Author: strongtu@tencent.com
+*         ianhuang@tencent.com
+*         chriskzhou@tencent.com
+*/
 #include "clbutter_comparator.h"
 #include "clguetzli.h"
 #include "clguetzli_test.h"
diff --git a/clguetzli/clbutter_comparator.h b/clguetzli/clbutter_comparator.h
index c26de1de..76380785 100644
--- a/clguetzli/clbutter_comparator.h
+++ b/clguetzli/clbutter_comparator.h
@@ -1,3 +1,10 @@
+/*
+* OpenCL/CUDA edition implementation of butter_comparator.
+*
+* Author: strongtu@tencent.com
+*         ianhuang@tencent.com
+*         chriskzhou@tencent.com
+*/
 #pragma once
 #include <vector>
 #include "butteraugli/butteraugli.h"
diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index c2e67e80..2d18e8bd 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -1,3 +1,10 @@
+/*
+* OpenCL Kernels
+*
+* Author: strongtu@tencent.com
+*         ianhuang@tencent.com
+*         chriskzhou@tencent.com
+*/
 #ifdef __USE_OPENCL__
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
diff --git a/clguetzli/clguetzli.cl.cpp b/clguetzli/clguetzli.cl.cpp
index 45533f60..619c0cfd 100644
--- a/clguetzli/clguetzli.cl.cpp
+++ b/clguetzli/clguetzli.cl.cpp
@@ -1,3 +1,10 @@
+/*
+* OpenCL/CUDA edition implementation of ButteraugliComparator.
+*
+* Author: strongtu@tencent.com
+*         ianhuang@tencent.com
+*         chriskzhou@tencent.com
+*/
 #include <algorithm>
 #include <stdint.h>
 #include <vector>
diff --git a/clguetzli/clguetzli.cl.h b/clguetzli/clguetzli.cl.h
index 761ed634..12543e42 100644
--- a/clguetzli/clguetzli.cl.h
+++ b/clguetzli/clguetzli.cl.h
@@ -1,3 +1,10 @@
+/*
+* OpenCL/CUDA edition implementation of ButteraugliComparator.
+*
+* Author: strongtu@tencent.com
+*         ianhuang@tencent.com
+*         chriskzhou@tencent.com
+*/
 #ifndef __CLGUETZLI_CL_H__
 #define __CLGUETZLI_CL_H__
 
diff --git a/clguetzli/clguetzli.cpp b/clguetzli/clguetzli.cpp
index be8e8c10..52129927 100644
--- a/clguetzli/clguetzli.cpp
+++ b/clguetzli/clguetzli.cpp
@@ -1,3 +1,10 @@
+/*
+* OpenCL edition implementation of guetzli.
+*
+* Author: strongtu@tencent.com
+*         ianhuang@tencent.com
+*         chriskzhou@tencent.com
+*/
 #include "clguetzli.h"
 #include <math.h>
 #include <algorithm>
diff --git a/clguetzli/clguetzli.cu b/clguetzli/clguetzli.cu
index 974be98e..2b7a71c4 100644
--- a/clguetzli/clguetzli.cu
+++ b/clguetzli/clguetzli.cu
@@ -1 +1,8 @@
+/*
+* CUDA Kernels
+*
+* Author: strongtu@tencent.com
+*         ianhuang@tencent.com
+*         chriskzhou@tencent.com
+*/
 #include "clguetzli/clguetzli.cl"
diff --git a/clguetzli/clguetzli.h b/clguetzli/clguetzli.h
index c01da7a4..c4f3961c 100644
--- a/clguetzli/clguetzli.h
+++ b/clguetzli/clguetzli.h
@@ -1,3 +1,10 @@
+/*
+* OpenCL edition implementation of guetzli.
+*
+* Author: strongtu@tencent.com
+*         ianhuang@tencent.com
+*         chriskzhou@tencent.com
+*/
 #pragma once
 #include <vector>
 #include "guetzli/processor.h"
diff --git a/clguetzli/clguetzli_test.cpp b/clguetzli/clguetzli_test.cpp
index 6e6fece8..2e5af412 100644
--- a/clguetzli/clguetzli_test.cpp
+++ b/clguetzli/clguetzli_test.cpp
@@ -1,3 +1,10 @@
+/*
+* OpenCL test cases
+*
+* Author: strongtu@tencent.com
+*         ianhuang@tencent.com
+*         chriskzhou@tencent.com
+*/
 #ifdef __USE_OPENCL__
 
 #include <CL/cl.h>
diff --git a/clguetzli/clguetzli_test.h b/clguetzli/clguetzli_test.h
index 94c0a2c6..dbc3c47a 100644
--- a/clguetzli/clguetzli_test.h
+++ b/clguetzli/clguetzli_test.h
@@ -1,3 +1,10 @@
+/*
+* OpenCL test cases
+*
+* Author: strongtu@tencent.com
+*         ianhuang@tencent.com
+*         chriskzhou@tencent.com
+*/
 #pragma once
 #include "ocl.h"
 
diff --git a/clguetzli/cuguetzli.cpp b/clguetzli/cuguetzli.cpp
index 1903c6eb..f348edb7 100644
--- a/clguetzli/cuguetzli.cpp
+++ b/clguetzli/cuguetzli.cpp
@@ -1,3 +1,10 @@
+/*
+* CUDA edition implementation of guetzli.
+*
+* Author: strongtu@tencent.com
+*         ianhuang@tencent.com
+*         chriskzhou@tencent.com
+*/
 #include "cuguetzli.h"
 #include <algorithm>
 #include "ocu.h"
diff --git a/clguetzli/cuguetzli.h b/clguetzli/cuguetzli.h
index a75dcc46..8c3e3444 100644
--- a/clguetzli/cuguetzli.h
+++ b/clguetzli/cuguetzli.h
@@ -1,3 +1,10 @@
+/*
+* CUDA edition implementation of guetzli.
+*
+* Author: strongtu@tencent.com
+*         ianhuang@tencent.com
+*         chriskzhou@tencent.com
+*/
 #pragma once
 #include "guetzli/processor.h"
 #include "clguetzli.cl.h"
diff --git a/clguetzli/cumem_pool.cpp b/clguetzli/cumem_pool.cpp
index 4fe4964d..8252d3e7 100644
--- a/clguetzli/cumem_pool.cpp
+++ b/clguetzli/cumem_pool.cpp
@@ -1,3 +1,9 @@
+/*
+ * Memory Pool for CUDA
+ *
+ * Author: ianhuang@tencent.com
+ */
+
 #include "cumem_pool.h"
 
 #ifdef __USE_CUDA__
diff --git a/clguetzli/cumem_pool.h b/clguetzli/cumem_pool.h
index d2ceec04..b878d92f 100644
--- a/clguetzli/cumem_pool.h
+++ b/clguetzli/cumem_pool.h
@@ -1,3 +1,8 @@
+/*
+* Memory Pool for CUDA
+*
+* Author: ianhuang@tencent.com
+*/
 #pragma once
 
 #ifdef __USE_CUDA__
diff --git a/clguetzli/ocl.cpp b/clguetzli/ocl.cpp
index f4427fff..851ab943 100644
--- a/clguetzli/ocl.cpp
+++ b/clguetzli/ocl.cpp
@@ -1,3 +1,9 @@
+/*
+* OpenCL Manager
+*
+* Author: strongtu@tencent.com
+*         ianhuang@tencent.com
+*/
 #include "ocl.h"
 #include <string.h>
 #include <vector>
diff --git a/clguetzli/ocl.h b/clguetzli/ocl.h
index f3056dd8..7ccee2d8 100644
--- a/clguetzli/ocl.h
+++ b/clguetzli/ocl.h
@@ -1,3 +1,9 @@
+/*
+* OpenCL Manager
+*
+* Author: strongtu@tencent.com
+*         ianhuang@tencent.com
+*/
 #pragma once
 
 #ifdef __USE_OPENCL__
diff --git a/clguetzli/ocu.cpp b/clguetzli/ocu.cpp
index ea66be55..b7395ed1 100644
--- a/clguetzli/ocu.cpp
+++ b/clguetzli/ocu.cpp
@@ -1,3 +1,8 @@
+/*
+* CUDA Manager
+*
+* Author: strongtu@tencent.com
+*/
 #include "ocu.h"
 
 #ifdef __USE_CUDA__
diff --git a/clguetzli/ocu.h b/clguetzli/ocu.h
index 1c13e86e..93f675a3 100644
--- a/clguetzli/ocu.h
+++ b/clguetzli/ocu.h
@@ -1,3 +1,8 @@
+/*
+* CUDA Manager
+*
+* Author: strongtu@tencent.com
+*/
 #pragma once
 
 #ifdef __USE_CUDA__

From 808e624565f3f2165be947271256071d1d596354 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Thu, 13 Jul 2017 23:14:05 +0800
Subject: [PATCH 187/189] Update ReadMe

---
 README.md | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/README.md b/README.md
index 2ecd1072..b316a904 100644
--- a/README.md
+++ b/README.md
@@ -99,3 +99,56 @@ attempts made.
 Please note that JPEG images do not support alpha channel (transparency). If the
 input is a PNG with an alpha channel, it will be overlaid on black background
 before encoding.
+
+# Extra features
+
+**Note:** Please make sure that you can build guetzli successfully before adding the following features.
+
+## Enable CUDA/OpenCL support
+
+**Note:** Before adding [CUDA](https://developer.nvidia.com/cuda-zone) support, please [check](http://developer.nvidia.com/cuda-gpus) whether your GPU supports CUDA.
+
+**Note:** If you don't have an NVIDIA card that supports CUDA, you can try [OpenCL](https://www.khronos.org/opencl/) instead. You can install any of the OpenCL SDKs, such as [Intel OpenCL SDK](https://software.intel.com/en-us/intel-opencl), [AMD OpenCL SDK](http://developer.amd.com/tools-and-sdks/opencl-zone/), etc.
+
+**Note:** The steps for adding OpenCL support are very similar to those for adding CUDA support, so the following instructions cover only CUDA.
+
+### On POSIX systems
+1. Follow the [Installation Guide for Linux ](https://developer.nvidia.com/compute/cuda/8.0/Prod2/docs/sidebar/CUDA_Installation_Guide_Linux-pdf) to setup [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit).
+2. Edit `premake5.lua`, add `defines { "__USE_OPENCL__", "__USE_CUDA__" }` and `links { "OpenCL", "cuda" }` under `filter "action:gmake"`. Then do `premake5 --os=linux gmake` to update the makefile.
+3. Run `make` and expect the binary to be created in `bin/Release/guetzli`.
+4. Run `./compile.sh 64` or `./compile.sh 32` to build the 64 or 32 bits [ptx](http://docs.nvidia.com/cuda/parallel-thread-execution) file, and the ptx file will be copied to `bin/Release/clguetzli`.
+
+### On Windows
+1. Follow the [Installation Guide for Microsoft Windows](https://developer.nvidia.com/compute/cuda/8.0/Prod2/docs/sidebar/CUDA_Installation_Guide_Windows-pdf) to setup `CUDA Toolkit`.
+2. Open the Visual Studio project and edit the project `Property Pages` as follows:
+    * Add `__USE_OPENCL__` and `__USE_CUDA__` to preprocessor definitions.
+    * Add `OpenCL.lib` and `cuda.lib` to additional dependencies.
+    * Add `$(CUDA_PATH)\include` to include directories.
+    * Add `$(CUDA_PATH)\lib\Win32` or `$(CUDA_PATH)\lib\x64` to library directories.
+3. Build it.
+
+### Usage
+```bash
+guetzli [--c|--cuda|--opencl] [other options] original.png output.jpg
+guetzli [--c|--cuda|--opencl] [other options] original.jpg output.jpg
+```
+Pass `--c` to enable the procedure optimization, `--cuda` to use CUDA acceleration, or `--opencl` to use OpenCL acceleration.
+
+If you have any questions about CUDA/OpenCL support, please contact strongtu@tencent.com, ianhuang@tencent.com or chriskzhou@tencent.com.
+
+## Enable full JPEG format support
+### On POSIX systems
+1. Install [libjpeg](http://libjpeg.sourceforge.net/).
+    If using your operating system
+    package manager, install development versions of the packages if the
+    distinction exists.
+    *   On Ubuntu, do `apt-get install libjpeg8-dev`.
+    *   On Fedora, do `dnf install libjpeg-devel`.
+    *   On Arch Linux, do `pacman -S libjpeg`.
+    *   On Alpine Linux, do `apk add libjpeg`.
+2. Edit `premake5.lua`, add `defines {"__SUPPORT_FULL_JPEG__"}` and `links { "jpeg" }` under `filter "action:gmake"`. Then do `premake5 --os=linux gmake` to update the makefile.
+3. Run `make` and expect the binary to be created in `bin/Release/guetzli`.
+### On Windows
+1. Install `libjpeg-turbo` using vcpkg: `.\vcpkg install libjpeg-turbo`
+2. Open the Visual Studio project and add `__SUPPORT_FULL_JPEG__` to preprocessor definitions in the project `Property Pages`.
+3. Build it.
\ No newline at end of file

From af12f124f345849de30a4fb19fc8bd72f7f66fcf Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Tue, 18 Jul 2017 10:46:57 +0800
Subject: [PATCH 188/189] Update ReadMe & fix some mistakes

---
 README.md              | 11 +++++++----
 clguetzli/clguetzli.cl |  2 +-
 compile.sh             |  2 +-
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index b316a904..37fa4267 100644
--- a/README.md
+++ b/README.md
@@ -115,17 +115,20 @@ before encoding.
 ### On POSIX systems
 1. Follow the [Installation Guide for Linux ](https://developer.nvidia.com/compute/cuda/8.0/Prod2/docs/sidebar/CUDA_Installation_Guide_Linux-pdf) to setup [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit).
 2. Edit `premake5.lua`, add `defines { "__USE_OPENCL__", "__USE_CUDA__" }` and `links { "OpenCL", "cuda" }` under `filter "action:gmake"`. Then do `premake5 --os=linux gmake` to update the makefile.
-3. Run `make` and expect the binary to be created in `bin/Release/guetzli`.
-4. Run `./compile.sh 64` or `./compile.sh 32` to build the 64 or 32 bits [ptx](http://docs.nvidia.com/cuda/parallel-thread-execution) file, and the ptx file will be copied to `bin/Release/clguetzli`.
+3. Edit `clguetzli/clguetzli.cl` and add `#define __USE_OPENCL__` as the first line.
+4. Run `make` and expect the binary to be created in `bin/Release/guetzli`.
+5. Run `./compile.sh 64` or `./compile.sh 32` to build the 64-bit or 32-bit [ptx](http://docs.nvidia.com/cuda/parallel-thread-execution) file; the ptx file will be copied to `bin/Release/clguetzli`.
 
 ### On Windows
 1. Follow the [Installation Guide for Microsoft Windows](https://developer.nvidia.com/compute/cuda/8.0/Prod2/docs/sidebar/CUDA_Installation_Guide_Windows-pdf) to setup `CUDA Toolkit`.
-2. Open the Visual Studio project and edit the project `Property Pages` as follows:
+2. Copy `<vs2015 dir>\VC\bin\amd64\vcvars64.bat` to `<guetzli dir>\vcvars64.bat`.
+3. Open the Visual Studio project and edit the project `Property Pages` as follows:
     * Add `__USE_OPENCL__` and `__USE_CUDA__` to preprocessor definitions.
     * Add `OpenCL.lib` and `cuda.lib` to additional dependencies.
     * Add `$(CUDA_PATH)\include` to include directories.
     * Add `$(CUDA_PATH)\lib\Win32` or `$(CUDA_PATH)\lib\x64` to library directories.
-3. Build it.
+4. Edit `clguetzli/clguetzli.cl` and add `#define __USE_OPENCL__` as the first line.
+5. Build it.
 
 ### Usage
 ```bash
diff --git a/clguetzli/clguetzli.cl b/clguetzli/clguetzli.cl
index 2d18e8bd..2a8eb527 100644
--- a/clguetzli/clguetzli.cl
+++ b/clguetzli/clguetzli.cl
@@ -3417,4 +3417,4 @@ __device__ double CompareBlockFactor(const channel_info mayout_channel[3],
 #undef double
 #endif
 
-#endif __USE_OPENCL__
\ No newline at end of file
+#endif //__USE_OPENCL__
\ No newline at end of file
diff --git a/compile.sh b/compile.sh
index 0b13d464..eabb6473 100755
--- a/compile.sh
+++ b/compile.sh
@@ -4,7 +4,7 @@
 echo $1 --machine 64 or 32
 echo $2 -G
 
-nvcc -I"./" -I"/usr/local/cuda/include" -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1  clguetzli/clguetzli.cu
+nvcc -D__USE_OPENCL__ -I"./" -I"/usr/local/cuda/include" -ftz=true -prec-div=false -prec-sqrt=false -arch=compute_30 -O3 --machine $1 $2 -ptx -o clguetzli/clguetzli.cu.ptx$1  clguetzli/clguetzli.cu
 
 #copy to ./bin/Release
 cp clguetzli/clguetzli.cu.ptx$1 bin/Release/clguetzli/clguetzli.cu.ptx$1

From 14ef86d66f9caf642fa6adf2ef6c6697e17c5b25 Mon Sep 17 00:00:00 2001
From: Zhou Ke <crazyks@yeah.net>
Date: Thu, 20 Jul 2017 00:00:59 +0800
Subject: [PATCH 189/189] Update appveyor.yml

---
 appveyor.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/appveyor.yml b/appveyor.yml
index 061ab6d0..97acb3ac 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -15,7 +15,7 @@ install:
   - premake5.exe %TOOLSET%
   - git clone https://github.com/Microsoft/vcpkg
   - md vcpkg\downloads\nuget-3.5.0
-  - appveyor DownloadFile https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -FileName %appveyor_build_folder%\vcpkg\downloads\nuget-3.5.0\nuget.exe
+  - appveyor DownloadFile https://dist.nuget.org/win-x86-commandline/v3.5.0/nuget.exe -FileName %appveyor_build_folder%\vcpkg\downloads\nuget-3.5.0\nuget.exe
   - appveyor DownloadFile https://cmake.org/files/v3.8/cmake-3.8.0-rc1-win32-x86.zip -FileName %appveyor_build_folder%\vcpkg\downloads\cmake-3.8.0-rc1-win32-x86.zip
   - 7z x %appveyor_build_folder%\vcpkg\downloads\cmake-3.8.0-rc1-win32-x86.zip
   - cd vcpkg