diff --git a/pencil-benchmarks-imageproc/hog/HogDescriptor.cl b/pencil-benchmarks-imageproc/hog/HogDescriptor.cl
index 52453f4..583ec62 100644
--- a/pencil-benchmarks-imageproc/hog/HogDescriptor.cl
+++ b/pencil-benchmarks-imageproc/hog/HogDescriptor.cl
@@ -17,7 +17,7 @@
 #error STATIC_BLOCK_SIZE not defined
 #endif
 #if (USE_GLOBAL + USE_LOCAL + USE_PRIVATE != 1) 
-#error define one of one of USE_PRIVATE, USE_LOCAL, USE_GLOBAL to 1 to store intermediate histogram data in private, local, or global memory
+#error define one of USE_PRIVATE, USE_LOCAL, USE_GLOBAL to 1 to store intermediate histogram data in private, local, or global memory
 #endif
 
 #if VECTOR_SIZE == 16
@@ -128,7 +128,7 @@ void atomic_add_float(volatile LOCAL float* source, float operand) {
 #undef LOCAL
 #endif
 
-kernel void FUNCTIONNAME(        const          int2                    img_size
+kernel void KERNEL_NAME (        const          int2                    img_size
                         ,        const unsigned int                     step_
                         , global const unsigned char   * const restrict image
                         ,        const unsigned int                     num_locations
diff --git a/pencil-benchmarks-imageproc/hog/HogDescriptor.h b/pencil-benchmarks-imageproc/hog/HogDescriptor.h
index 08bce49..7d23e50 100644
--- a/pencil-benchmarks-imageproc/hog/HogDescriptor.h
+++ b/pencil-benchmarks-imageproc/hog/HogDescriptor.h
@@ -7,6 +7,10 @@
 #define __CL_ENABLE_EXCEPTIONS
 #include <CL/cl.hpp>
 
+#ifndef VECTOR_SIZE
+#define VECTOR_SIZE 4
+#endif
+
 namespace nel {
     template<int numberOfCells, int numberOfBins, bool gauss, bool spinterp, bool _signed, bool static_>
     class HOGDescriptorCPP {
@@ -85,7 +89,7 @@ namespace nel {
         static size_t getNumberOfBins();
 
     private:
-        enum HOGAlgorithmType { use_private, use_local, use_global };
+        enum HOGAlgorithmType { use_private = 0, use_local = 1, use_global = 2 };
         HOGAlgorithmType getAlgorithmType() const;
 
         cl::Device       m_device;
@@ -99,7 +103,7 @@ namespace nel {
 
         bool m_has_local_memory;
 
-        static const size_t ms_calchog_vector_size = 4;
+        static const size_t ms_calc_hog_vector_size = VECTOR_SIZE;
     };
 }
 
diff --git a/pencil-benchmarks-imageproc/hog/HogDescriptor.hpp b/pencil-benchmarks-imageproc/hog/HogDescriptor.hpp
index 17122d6..738d5a8 100644
--- a/pencil-benchmarks-imageproc/hog/HogDescriptor.hpp
+++ b/pencil-benchmarks-imageproc/hog/HogDescriptor.hpp
@@ -7,6 +7,28 @@
     #include <tbb/blocked_range.h>
 #endif
 
+#if (1 == XOPENME)
+#include <xopenme.h>
+#endif
+
+#ifndef M_1_PI
+#define M_1_PI 0.318309886183790671538  // 1/pi (NOT pi): fallback for toolchains whose <cmath> lacks the POSIX constant
+#endif
+
+#ifndef SELECT_CL_DEVICE_TYPE
+#define SELECT_CL_DEVICE_TYPE CL_DEVICE_TYPE_GPU
+#endif
+
+#ifndef HOG_CL
+#define HOG_CL "HogDescriptor.cl"
+#endif
+
+#ifndef USE_PRIVATE_THRESHOLD
+// The default threshold selects private memory for 2x2 HOG with <= 9 bins,
+// but switches to local or global memory for 4x4 HOG with > 2 bins.
+#define USE_PRIVATE_THRESHOLD (36)
+#endif
+
 namespace {
     template<typename T>
     inline int fast_floor(T f) {
@@ -102,8 +124,10 @@ cv::Mat_<float> nel::HOGDescriptorCPP<numberOfCells, numberOfBins, gauss, spinte
 
         // compute edges, magnitudes, orientations and the histogram
         for (int pointy = minyi; pointy <= maxyi; pointy++) {
+#if defined (__GNUC__)
             #pragma GCC diagnostic push
             #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
             int cellyi;
             float yscale0;
             float yscale1;
@@ -187,7 +211,9 @@ cv::Mat_<float> nel::HOGDescriptorCPP<numberOfCells, numberOfBins, gauss, spinte
                     }
                 }
             }
+#if defined (__GNUC__)
             #pragma GCC diagnostic pop
+#endif
         }
         cv::Mat_<float> destRow = descriptors.row(n);
         cv::Mat(hist).reshape(0,1).copyTo(destRow);
@@ -201,12 +227,16 @@ cv::Mat_<float> nel::HOGDescriptorCPP<numberOfCells, numberOfBins, gauss, spinte
 template<size_t numberOfCells, size_t numberOfBins, bool gauss, bool spinterp, bool _signed, bool _static>
 typename nel::HOGDescriptorOCL<numberOfCells, numberOfBins, gauss, spinterp, _signed, _static>::HOGAlgorithmType
 nel::HOGDescriptorOCL<numberOfCells, numberOfBins, gauss, spinterp, _signed, _static>::getAlgorithmType() const {
-    if (getNumberOfBins() <= 36)
+#ifdef HOG_TYPE
+    return static_cast<HOGAlgorithmType>(HOG_TYPE);  // build-time override; must be 0 (use_private), 1 (use_local) or 2 (use_global)
+#else
+    if (getNumberOfBins() <= USE_PRIVATE_THRESHOLD)  // at or below the threshold, use_private is selected
         return use_private;
     if (m_has_local_memory)
         return use_local;
     else
         return use_global;
+#endif
 }
 
 template<size_t numberOfCells, size_t numberOfBins, bool gauss, bool spinterp, bool _signed, bool _static>
@@ -216,8 +246,9 @@ nel::HOGDescriptorOCL<numberOfCells, numberOfBins, gauss, spinterp, _signed, _st
 
     //Get the used device
     std::vector<cl::Device> devices;
-    cl::Platform::getDefault().getDevices(CL_DEVICE_TYPE_GPU, &devices);
+    cl::Platform::getDefault().getDevices(SELECT_CL_DEVICE_TYPE, &devices);
     m_device = devices.at(0);
+    std::cout << "Device: " << m_device.getInfo<CL_DEVICE_NAME>() << std::endl;
 
     //Create context
     m_context = cl::Context(m_device);
@@ -225,12 +256,12 @@ nel::HOGDescriptorOCL<numberOfCells, numberOfBins, gauss, spinterp, _signed, _st
     m_has_local_memory = (CL_LOCAL == m_device.getInfo<CL_DEVICE_LOCAL_MEM_TYPE>());
 
     //Load source
-    std::ifstream source_file{ "HogDescriptor.cl" };
+    std::ifstream source_file{ HOG_CL };
     std::string source{ std::istreambuf_iterator<char>{source_file}, std::istreambuf_iterator<char>{} };
 
     //Create program
     cl::Program program(m_context, source);
-    std::string functionname = "hog" + std::to_string(numberOfCells) + 'x' + std::to_string(numberOfCells);
+    std::string kernel_name = "hog" + std::to_string(numberOfCells) + 'x' + std::to_string(numberOfCells);
 
     //Build options
     std::stringstream build_opts;
@@ -238,7 +269,9 @@ nel::HOGDescriptorOCL<numberOfCells, numberOfBins, gauss, spinterp, _signed, _st
     build_opts << " -cl-denorms-are-zero";
     build_opts << " -cl-mad-enable";
     build_opts << " -cl-no-signed-zeros";
+#if defined (CL_FINITE_MATH_ONLY)
     build_opts << " -cl-finite-math-only";
+#endif
     build_opts << " -Werror";
     build_opts << " -D NUMBER_OF_CELLS="  << numberOfCells;
     build_opts << " -D NUMBER_OF_BINS="   << numberOfBins;
@@ -246,17 +279,23 @@ nel::HOGDescriptorOCL<numberOfCells, numberOfBins, gauss, spinterp, _signed, _st
     build_opts << " -D SPARTIAL_WEIGHTS=" << (spinterp ? "1" : "0");
     build_opts << " -D SIGNED_HOG="       << (_signed  ? "1" : "0");
     build_opts << " -D STATIC_BLOCK_SIZE="<< (_static  ? "1" : "0");
-    build_opts << " -D FUNCTIONNAME="     << functionname;
-    build_opts << " -D VECTOR_SIZE="      << ms_calchog_vector_size;
+    build_opts << " -D KERNEL_NAME="      << kernel_name;
+    build_opts << " -D VECTOR_SIZE="      << ms_calc_hog_vector_size;
     switch (getAlgorithmType()) {
         case use_private: build_opts << " -D USE_PRIVATE=1"; break;
         case use_local  : build_opts << " -D USE_LOCAL=1";   break;
-        case use_global: default: break;
+        case use_global:  build_opts << " -D USE_GLOBAL=1";  break;
+        default: assert(0 && "Unsupported algorithm type!");
     }
 #ifndef NDEBUG
     build_opts << " -D DEBUG";
 #endif
 
+#if (1 == XOPENME)
+    xopenme_add_var_s(11, (char*) "  \"kernel_name\":\"%s\"", (void*) kernel_name.c_str());
+    xopenme_add_var_s(12, (char*) "  \"kernel_build_options\":\"%s\"", (void*) build_opts.str().c_str());
+#endif
+
     //Build program
     try {
         program.build(build_opts.str().c_str());
@@ -270,7 +309,7 @@ nel::HOGDescriptorOCL<numberOfCells, numberOfBins, gauss, spinterp, _signed, _st
     m_queue = cl::CommandQueue(m_context, m_device, CL_QUEUE_PROFILING_ENABLE);
 
     //Query kernels and work group infos
-    m_calc_hog = cl::Kernel(program, functionname.c_str());
+    m_calc_hog = cl::Kernel(program, kernel_name.c_str());
     m_calc_hog_preferred_multiple = m_calc_hog.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(m_device);
     m_calc_hog_group_size         = m_calc_hog.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(m_device);
 }
@@ -316,16 +355,22 @@ cv::Mat_<float> nel::HOGDescriptorOCL<numberOfCells, numberOfBins, gauss, spinte
     m_queue.enqueueWriteBuffer( locations_cl, CL_FALSE, 0,  locations_bytes,  locations.data);
     auto blocksizes_cl = blocksizes.convert_to_opencl_data(m_context, m_queue);   //If dynamic: creates cl::Buffer like the rest. If static: creates a functor that returns the static value as cl_float
 
+
+#define DEFAULT_LWS_X 4
+#define DEFAULT_LWS_Y 8
+#define DEFAULT_LWS_Z std::min( std::max<size_t>(num_locations / m_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1), m_calc_hog_group_size / (DEFAULT_LWS_X * DEFAULT_LWS_Y))
+
 #ifndef LWS_X
-#define LWS_X 4
+#define LWS_X DEFAULT_LWS_X
 #endif
 
 #ifndef LWS_Y
-#define LWS_Y 8
+#define LWS_Y DEFAULT_LWS_Y
 #endif
 
+// This allows tuning when only one of the dimensions LWS_X and LWS_Y changes.
 #ifndef LWS_Z
-#define LWS_Z std::min( std::max<size_t>(num_locations / m_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1), m_calc_hog_group_size / (lws_x * lws_y))
+#define LWS_Z std::min( std::max<size_t>(num_locations / m_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1), m_calc_hog_group_size / (LWS_X * LWS_Y))
 #endif
     {
         //Figure out the work group sizes
@@ -341,6 +386,20 @@ cv::Mat_<float> nel::HOGDescriptorOCL<numberOfCells, numberOfBins, gauss, spinte
 
         cl_int2 image_size = { image.cols, image.rows };
 
+#if (1 == XOPENME)
+        xopenme_add_var_i(13, (char*) "  \"default_lws_x\":%u", DEFAULT_LWS_X);
+        xopenme_add_var_i(14, (char*) "  \"default_lws_y\":%u", DEFAULT_LWS_Y);
+        xopenme_add_var_i(15, (char*) "  \"default_lws_z\":%u", DEFAULT_LWS_Z);
+
+        xopenme_add_var_i(16, (char*) "  \"lws_x\":%u", lws_x);
+        xopenme_add_var_i(17, (char*) "  \"lws_y\":%u", lws_y);
+        xopenme_add_var_i(18, (char*) "  \"lws_z\":%u", lws_z);
+
+        xopenme_add_var_i(19, (char*) "  \"gws_x\":%u", gws_x);
+        xopenme_add_var_i(20, (char*) "  \"gws_y\":%u", gws_y);
+        xopenme_add_var_i(21, (char*) "  \"gws_z\":%u", gws_z);
+#endif
+
         //Execute the kernel
         {
             //ADD MULTITHREAD LOCK HERE (if needed)
@@ -353,7 +412,7 @@ cv::Mat_<float> nel::HOGDescriptorOCL<numberOfCells, numberOfBins, gauss, spinte
             m_calc_hog.setArg(6, (cl_mem )descriptor_cl());
             switch (getAlgorithmType()) {
             case use_private:
-                m_calc_hog.setArg(7, (size_t)(sizeof(cl_float) * lws_x * lws_y * lws_z * ms_calchog_vector_size), nullptr);
+                m_calc_hog.setArg(7, (size_t)(sizeof(cl_float) * lws_x * lws_y * lws_z * ms_calc_hog_vector_size), nullptr);
                 break;
             case use_local:
                 m_calc_hog.setArg(7, (size_t)(sizeof(cl_float) * getNumberOfBins() * lws_z), nullptr);
diff --git a/pencil-benchmarks-imageproc/hog/test_hog.cpp b/pencil-benchmarks-imageproc/hog/test_hog.cpp
index a9c88f5..b3d2ee3 100644
--- a/pencil-benchmarks-imageproc/hog/test_hog.cpp
+++ b/pencil-benchmarks-imageproc/hog/test_hog.cpp
@@ -1,9 +1,22 @@
+//CK - a temporary workaround for the autotuning compiler_vars issue.
+#define HOG4X4 1
+#define BLOCK_SIZE 64
+
+//CK
+#ifdef WINDOWS
+#include <numeric>
+#include <random>
+#define M_PI 3.14159265358979323846
+#endif
+#if (1 == XOPENME)
+#include <xopenme.h>
+#endif
+
 #include "utility.hpp"
 #include "HogDescriptor.h"
 
 #include <opencv2/imgproc/imgproc.hpp>
 
-
 #ifndef EXCLUDE_PENCIL_TEST
 #include <prl.h>
 #include "hog.pencil.h"
@@ -12,30 +25,27 @@
 #define prl_shutdown(x)
 #endif
 
-//Useful set 1:
-//#define NUMBER_OF_CELLS 4
-//#define NUMBER_OF_BINS 8
-//#define GAUSSIAN_WEIGHTS 1
-//#define SPARTIAL_WEIGHTS 1
-//#define SIGNED_HOG 1
-//#define STATIC_HOG 1
-
-//Useful set 2:
-//#define NUMBER_OF_CELLS 2
-//#define NUMBER_OF_BINS 4
-//#define GAUSSIAN_WEIGHTS 1
-//#define SPARTIAL_WEIGHTS 1
-//#define SIGNED_HOG 0
-//#define STATIC_HOG 0
-
-#ifndef BLOCK_SIZE
-#define BLOCK_SIZE 32
+//Useful set 1.
+#if (1 == HOG4X4)
+#define NUMBER_OF_CELLS 4
+#define NUMBER_OF_BINS 8
+#define GAUSSIAN_WEIGHTS 1
+#define SPARTIAL_WEIGHTS 1
+#define SIGNED_HOG 1
+#define STATIC_HOG 1
 #endif
 
-#ifndef NUMBER_OF_LOCATIONS
-#define NUMBER_OF_LOCATIONS 49
+//Useful set 2.
+#if (1 == HOG2X2)
+#define NUMBER_OF_CELLS 2
+#define NUMBER_OF_BINS 4
+#define GAUSSIAN_WEIGHTS 1
+#define SPARTIAL_WEIGHTS 1
+#define SIGNED_HOG 0
+#define STATIC_HOG 0
 #endif
 
+//Defaults as for HOG2X2.
 #ifndef NUMBER_OF_CELLS
 #define NUMBER_OF_CELLS 2
 #endif
@@ -60,6 +70,19 @@
 #define STATIC_HOG 0
 #endif
 
+#ifndef NUMBER_OF_LOCATIONS
+#define NUMBER_OF_LOCATIONS 49
+#endif
+
+#ifndef BLOCK_SIZE
+#define BLOCK_SIZE 32
+#endif
+
+#ifndef VECTOR_SIZE
+#define VECTOR_SIZE 4
+#endif
+
+
 void time_hog( const std::vector<carp::record_t>& pool, const std::vector<float>& sizes, int num_positions, int repeat )
 {
     carp::Timing timing("HOG");
@@ -73,17 +96,30 @@ void time_hog( const std::vector<carp::record_t>& pool, const std::vector<float>
             for ( auto & item : pool ) {
                 std::mt19937 rng(1);   //uses same seed, reseed for all iteration
 
+                cv::Mat cpu_orig = item.cpuimg();
                 cv::Mat cpu_gray;
-                cv::cvtColor( item.cpuimg(), cpu_gray, CV_RGB2GRAY );
+                if (cpu_orig.empty()) {
+                    std::cerr << "  empty image..." << std::endl;
+                    break;
+                } else if (cpu_orig.channels() <= 1) {
+                    cpu_gray = cpu_orig;
+                } else {
+                    cv::cvtColor( cpu_orig, cpu_gray, CV_RGB2GRAY );
+                }
+
                 std::cout << "image path: " << item.path()   << std::endl;
                 std::cout << "image rows: " << cpu_gray.rows << std::endl;
                 std::cout << "image cols: " << cpu_gray.cols << std::endl;
+#if (1 == XOPENME)
+                xopenme_add_var_i( 9, (char*) "  \"input_size_x\":%u", cpu_gray.cols);  // x extent = image width (columns)
+                xopenme_add_var_i(10, (char*) "  \"input_size_y\":%u", cpu_gray.rows);  // y extent = image height (rows)
+#endif
 
                 cv::Mat_<float> locations(num_positions, 2);
-#if !STATIC_HOG
-                cv::Mat_<float> blocksizes(num_positions, 2);
-#else
+#if (1 == STATIC_HOG)
                 float blocksizes = size;
+#else
+                cv::Mat_<float> blocksizes(num_positions, 2);
 #endif
                 //fill locations and blocksizes
                 std::uniform_real_distribution<float> genx(size/2+1, cpu_gray.cols-1-size/2-1);
@@ -91,7 +127,7 @@ void time_hog( const std::vector<carp::record_t>& pool, const std::vector<float>
                 for( int i = 0; i < num_positions; ++i) {
                     locations(i, 0) = genx(rng);
                     locations(i, 1) = geny(rng);
-#if !STATIC_HOG
+#if (1 != STATIC_HOG)
                     blocksizes(i, 0) = size;
                     blocksizes(i, 1) = size;
 #endif
@@ -135,7 +171,7 @@ void time_hog( const std::vector<carp::record_t>& pool, const std::vector<float>
 
                     if (first_execution_pencil)
                     {
-#if STATIC_HOG
+#if (1 == STATIC_HOG)
                         pencil_hog_static (
 #else
                         pencil_hog_dynamic(
@@ -144,7 +180,7 @@ void time_hog( const std::vector<carp::record_t>& pool, const std::vector<float>
                               , cpu_gray.rows, cpu_gray.cols, cpu_gray.step1(), cpu_gray.ptr<uint8_t>()
                               , num_positions
                               , reinterpret_cast<const float (*)[2]>(locations.data)
-#if STATIC_HOG
+#if (1 == STATIC_HOG)
                               , blocksizes
 #else
                               , reinterpret_cast<const float (*)[2]>(blocksizes.data)
@@ -170,6 +206,21 @@ void time_hog( const std::vector<carp::record_t>& pool, const std::vector<float>
                     prl_timings_dump();
                 }
 #endif
+
+#if (1 == XOPENME_DUMP_IMAGES)
+#ifdef WINDOWS
+                IplImage tmp;
+                tmp = cpu_result;
+                cvSaveImage("tmp-ck-state-hog-cpu.png", &tmp);
+                tmp = gpu_result;
+                cvSaveImage("tmp-ck-state-hog-gpu.png", &tmp);
+#else
+                cv::imwrite("tmp-ck-state-hog-cpu.png", cpu_result );
+                cv::imwrite("tmp-ck-state-hog-gpu.png", gpu_result );
+                cv::imwrite("tmp-ck-state-hog-diff-cpu-gpu.png", cv::abs(cpu_result-gpu_result) );
+#endif
+#endif
+
                 // Verifying the results
                 if ( cv::norm( cpu_result, gpu_result, cv::NORM_INF) > cv::norm( gpu_result, cv::NORM_INF)*1e-5 )
                 {
@@ -203,6 +254,24 @@ void time_hog( const std::vector<carp::record_t>& pool, const std::vector<float>
 
 int main(int argc, char* argv[])
 {
+
+#if (1 == XOPENME)
+    xopenme_init(1, 22);
+    xopenme_clock_start(0);
+
+    xopenme_add_var_i(0, (char*) "  \"number_of_cells\":%u"     , NUMBER_OF_CELLS     );
+    xopenme_add_var_i(1, (char*) "  \"number_of_bins\":%u"      , NUMBER_OF_BINS      );
+    xopenme_add_var_i(2, (char*) "  \"gaussian_weights\":%u"    , GAUSSIAN_WEIGHTS    );
+    xopenme_add_var_i(3, (char*) "  \"spartial_weights\":%u"    , SPARTIAL_WEIGHTS    );
+    xopenme_add_var_i(4, (char*) "  \"signed_hog\":%u"          , SIGNED_HOG          );
+    xopenme_add_var_i(5, (char*) "  \"static_hog\":%u"          , STATIC_HOG          );
+    xopenme_add_var_i(6, (char*) "  \"number_of_locations\":%u" , NUMBER_OF_LOCATIONS );
+    xopenme_add_var_i(7, (char*) "  \"block_size\":%u"          , BLOCK_SIZE          );
+    xopenme_add_var_i(8, (char*) "  \"vector_size\":%u"         , VECTOR_SIZE         );
+#endif
+
+    int exit_code = EXIT_SUCCESS;
+
     try
     {
         prl_init((prl_init_flags)(PRL_TARGET_DEVICE_DYNAMIC | PRL_PROFILING_ENABLED));
@@ -218,13 +287,23 @@ int main(int argc, char* argv[])
 
         prl_shutdown();
 
-        return EXIT_SUCCESS;
+        exit_code = EXIT_SUCCESS;
     }
     catch(const std::exception& e)
     {
         std::cout << e.what() << std::endl;
 
         prl_shutdown();
-        return EXIT_FAILURE;
+
+        exit_code = EXIT_FAILURE;
     }
+
+#if (1 == XOPENME)
+    xopenme_clock_end(0);
+    xopenme_dump_state();
+    xopenme_finish();
+#endif
+
+    return exit_code;
+
 } // main
diff --git a/pencil-benchmarks-imageproc/include/utility.hpp b/pencil-benchmarks-imageproc/include/utility.hpp
index 5f1b6ce..2416933 100644
--- a/pencil-benchmarks-imageproc/include/utility.hpp
+++ b/pencil-benchmarks-imageproc/include/utility.hpp
@@ -49,7 +49,11 @@ private:
 
 public:
     cv::Mat cpuimg() const {
+#ifdef WINDOWS
+        return cvLoadImage(m_path);
+#else
         return cv::imread(m_path);
+#endif
     }
 
     std::string path() const {
